|
- # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """
- The file structure is as follows:
- abdomen
- |--RawData.zip
- |--abdomen_raw
- │ ├── RawData
- │ │ ├──RawData
- │ │ │ ├── Training
- │ │ │ │ ├── img
- │ │ │ │ │ ├── img0001.nii.gz
- │ │ │ │ │ └── ...
- │ │ │ │ └── ...
- │ │ │ │ ├── label
- │ │ │ │ │ ├── label0001.nii.gz
- │ │ │ │ │ └── ...
- │ │ │ │ └── ...
- ├── abdomen_phase0
- │ ├── images
- │ │ ├── img0001-0001.npy
- │ │ └── ...
- │ ├── labels
- │ │ ├── label0001-0001.npy
- │ │ └── ...
- │ ├── train_list.txt
- │ └── val_list.txt
- support:
- 1. download and uncompress the file.
- 2. save the data as the above format.
- 3. split the training data and save the split result in train_list.txt and val_list.txt
-
- """
- import os
- import sys
-
- sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), ".."))
-
- import os.path as osp
- import time
- import json
-
- import numpy as np
- from tqdm import tqdm
-
- from prepare import Prep
- from preprocess_utils import HUnorm, ignore_label
- from medicalseg.utils import wrapped_partial
-
# Archive name -> download URL, handed to ``Prep`` by ``Prep_abdomen``.
# NOTE(review): the URL is empty here; presumably the archive has to be
# obtained manually -- confirm against the dataset instructions.
urls = {"Reg-Training-Training.zip": ""}

# Raw annotation id (as a string key) -> training class id.  This mapping is
# passed to ``ignore_label`` for the label volumes.  Raw ids that map to 0 have
# no training class of their own and end up in the background class.  The
# class names in the comments are the ones used for the dataset JSON in the
# ``__main__`` section of this file.
label_map = {
    "0": 0,   # background
    "1": 1,   # spleen
    "2": 2,   # right kidney
    "3": 3,   # left kidney
    "4": 4,   # gallbladder
    "5": 0,   # not kept -> background
    "6": 5,   # liver
    "7": 6,   # stomach
    "8": 7,   # aorta
    "9": 0,   # not kept -> background
    "10": 0,  # not kept -> background
    "11": 8,  # pancreas
    "12": 0,  # not kept -> background
    "13": 0,  # not kept -> background
}
-
-
class Prep_abdomen(Prep):
    """Prepare the abdomen CT dataset for training.

    The class splits the raw image/label volumes into a training and a
    validation subset, runs the configured preprocessing operations on them,
    stores the results as ``.npy`` files and writes ``train_list.txt`` and
    ``val_list.txt``.

    Attributes such as ``image_files``, ``label_files``, ``image_path``,
    ``label_path``, ``phase_path``, ``gpu_tag`` and ``write_txt`` are used
    here but defined outside this class (presumably by ``Prep``).
    """

    def __init__(self):
        """Configure paths and preprocessing, then split train/val files."""
        super().__init__(
            dataset_root="data/abdomen",
            raw_dataset_dir="abdomen_raw/",
            images_dir="RawData/RawData/Training/img",
            labels_dir="RawData/RawData/Training/label",
            phase_dir="abdomen_phase0/",
            urls=urls,
            valid_suffix=("nii.gz", "nii.gz"),
            filter_key=(None, None),
            uncompress_params={"format": "zip",
                               "num_files": 1})

        # Operations applied in order to every image / label array before it
        # is saved.  Images are clipped to [-125, 275] and then passed to
        # HUnorm with the same bounds; labels are remapped via ``label_map``.
        self.preprocess = {
            "images": [
                wrapped_partial(
                    np.clip, a_min=-125, a_max=275),
                wrapped_partial(
                    HUnorm, HU_min=-125, HU_max=275, multiply_255=False),
            ],
            "labels": [wrapped_partial(
                ignore_label, label_map=label_map)],
        }

        # Raw input files of each subset (filled by ``train_val_split``).
        self.train_image_files = []
        self.val_image_files = []
        self.train_label_files = []
        self.val_label_files = []

        # Names of the saved ``.npy`` files (filled by ``load_save``).
        self.train_image_files_npy = []
        self.val_image_files_npy = []
        self.train_label_files_npy = []
        self.val_label_files_npy = []
        self.train_val_split()

    def load_save(self, mode='train'):
        """Preprocess the files of one subset and save them as ``.npy``.

        Args:
            mode: ``'train'`` processes the training files and saves every
                slice along the first axis (after the transpose) as its own
                file named ``<stem>-<index>.npy``.  Any other value processes
                the validation files and saves each volume as a single
                ``<stem>.npy`` file.

        The names of the written files are appended to the matching
        ``*_files_npy`` lists so that ``generate_txt`` can list them.
        """
        print(
            "Start convert {} images to numpy array using {}, please wait patiently"
            .format(mode, self.gpu_tag))

        tic = time.time()
        if mode == 'train':
            process_files = (self.train_image_files, self.train_label_files)
            target_files = (self.train_image_files_npy,
                            self.train_label_files_npy)
        else:
            process_files = (self.val_image_files, self.val_label_files)
            target_files = (self.val_image_files_npy, self.val_label_files_npy)
        process_tuple = ("images", "labels")
        save_tuple = (self.image_path, self.label_path)

        # Handle the images first, then the labels; each kind has its own
        # preprocessing chain, output directory and list of written names.
        for kind, files, savepath, saved_names in zip(
                process_tuple, process_files, save_tuple, target_files):
            pre = self.preprocess[kind]

            for f in tqdm(
                    files,
                    total=len(files),
                    desc="preprocessing the {}".format(kind)):

                f_nps = Prep.load_medical_data(f)[0]
                # xyz to zxy
                f_nps = f_nps.transpose(2, 0, 1)
                # File name up to the first dot, e.g. "img0001" for
                # "img0001.nii.gz".
                stem = osp.basename(f).split(".")[0]

                if mode == 'train':
                    # Training data is stored slice by slice.
                    for volume_idx, f_np in enumerate(f_nps):
                        for op in pre:
                            f_np = op(f_np)
                        filename = stem + f"-{volume_idx:>04d}.npy"
                        np.save(os.path.join(savepath, filename), f_np)
                        saved_names.append(filename)
                else:
                    # Validation data is stored as whole volumes.
                    for op in pre:
                        f_nps = op(f_nps)
                    filename = stem + ".npy"
                    np.save(os.path.join(savepath, filename), f_nps)
                    saved_names.append(filename)

        print("The preprocess time on {} is {}".format(self.gpu_tag,
                                                       time.time() - tic))

    def generate_txt(self, train_split=0.6):
        """Generate the train_list.txt and val_list.txt.

        Args:
            train_split: Unused; the split is decided in ``train_val_split``.
                The parameter is kept so existing callers keep working.
        """
        txtname = [
            os.path.join(self.phase_path, 'train_list.txt'),
            os.path.join(self.phase_path, 'val_list.txt')
        ]
        self.write_txt(txtname[0], self.train_image_files_npy,
                       self.train_label_files_npy)
        self.write_txt(txtname[1], self.val_image_files_npy,
                       self.val_label_files_npy)

    def train_val_split(self, train_split=0.6):
        """Randomly (but reproducibly) split the files into train and val.

        Images and labels are shuffled with the same random state, so lists
        of equal length receive the same permutation and stay paired.  The
        first ``round(len(image_files) * train_split)`` shuffled entries
        become the training subset, the rest the validation subset.

        Earlier revisions shuffled copies of the file lists but then sliced
        the unshuffled originals, so the shuffle had no effect; the shuffled
        order is now the one that is actually split.

        Args:
            train_split: Fraction of the files assigned to the training
                subset.

        Note:
            This reseeds NumPy's global random generator with 0.
        """
        # np.array copies the lists, so shuffling does not reorder
        # self.image_files / self.label_files themselves.
        image_files = np.array(self.image_files)
        label_files = np.array(self.label_files)

        np.random.seed(0)
        state = np.random.get_state()
        np.random.shuffle(image_files)
        # Restore the state so the labels get the very same permutation.
        np.random.set_state(state)
        np.random.shuffle(label_files)

        train_len = round(len(self.image_files) * train_split)
        # tolist() turns the entries back into plain Python objects.
        self.train_image_files = image_files[:train_len].tolist()
        self.val_image_files = image_files[train_len:].tolist()
        self.train_label_files = label_files[:train_len].tolist()
        self.val_label_files = label_files[train_len:].tolist()
-
-
if __name__ == "__main__":
    # Script entry point: describe the dataset, convert both subsets to
    # ``.npy`` files and finally write the train/val list files.
    prep = Prep_abdomen()

    # Training class id -> organ name, stored in the dataset JSON.
    organ_names = {
        0: 'background',
        1: 'spleen',
        2: 'right kidney',
        3: 'left kidney',
        4: 'gallbladder',
        5: 'liver',
        6: 'stomach',
        7: 'aorta',
        8: 'pancreas',
    }
    description = "Under Institutional Review Board (IRB) supervision, 50 abdomen CT scans of were randomly selected from a combination of an ongoing colorectal cancer chemotherapy trial, and a retrospective ventral hernia study."

    prep.generate_dataset_json(
        modalities=('CT', ),
        labels=organ_names,
        dataset_name="Abdomen CT scans",
        dataset_description=description,
        license_desc="https://creativecommons.org/licenses/by/4.0/legalcode",
        dataset_reference="https://www.synapse.org/#!Synapse:syn3193805/wiki/89480",
    )

    # Training files first, then validation files, as before.
    for subset in ('train', 'val'):
        prep.load_save(mode=subset)
    prep.generate_txt()
|