|
- # Copyright 2022 Huawei Technologies Co., Ltd
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- # ============================================================================
-
- import librosa
- import argparse
- import numpy as np
- from tasnet import TasNet
- from mir_eval.separation import bss_eval_sources
- from data import DatasetGenerator
- # from function import get_input_with_list
- import mindspore
- import mindspore.dataset as ds
- import mindspore.ops as ops
- from mindspore import context
- from mindspore import load_checkpoint, load_param_into_net
- import os
- import zipfile
- import argparse
- import json
- import moxing as mox
- from mindspore import nn
- from mindspore.train import Model
- from mindspore.train.callback import LossMonitor, TimeMonitor, ModelCheckpoint, CheckpointConfig
- from mindspore.profiler import Profiler
- import time
- from mindspore import load_checkpoint, load_param_into_net
- from mindspore import Tensor, set_seed, Parameter
-
def _build_parser():
    """Create the command-line parser of the evaluation script.

    The options are declared in a table and registered in table order, so
    the ``--help`` listing keeps the same order as the declarations.

    NOTE: the title string is passed positionally, which argparse treats as
    ``prog`` (the program name shown in the usage line), not as a description.
    NOTE: ``--sample-rate`` and ``--sample_rate`` both store into
    ``args.sample_rate``.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    cli = argparse.ArgumentParser('Evaluate separation performance using TasNet')

    option_table = (
        # wav -> json indexing
        ('--in-dir', dict(type=str,
                          default=r"/home/work/user-job-dir/inputs/data/",
                          help='Directory path of wsj0 including tr, cv and tt')),
        ('--out-dir', dict(type=str,
                           default=r"/home/work/user-job-dir/inputs/data_json",
                           help='Directory path to put output files')),
        ('--sample-rate', dict(type=int, default=8000,
                               help='Sample rate of audio file')),
        # Remote storage locations and execution device
        ('--data_url', dict(help='path to training/inference dataset folder',
                            default='./data')),
        ('--train_url', dict(help='model folder to save/load',
                             default='./model')),
        ('--device_target', dict(type=str,
                                 default="Ascend",
                                 choices=['Ascend', 'GPU', 'CPU'],
                                 help='device where the code will be implemented (default: Ascend)')),
        # Evaluation inputs
        ('--model_path', dict(type=str,
                              default="Sunhy730_Train-10_695.ckpt",
                              help='Path to model file created by training')),
        ('--data_dir', dict(type=str,
                            default=r"/home/work/user-job-dir/inputs/data_json/test",
                            help='directory including mix.json, s1.json and s2.json')),
        ('--cal_sdr', dict(type=int, default=0,
                           help='Whether calculate SDR, add this option because calculation of SDR is very slow')),
        ('--use_cuda', dict(type=int, default=0,
                            help='Whether use GPU')),
        ('--sample_rate', dict(default=8000, type=int,
                               help='Sample rate')),
        ('--batch_size', dict(default=1, type=int,
                              help='Batch size')),
        # Network architecture
        ('--L', dict(default=40, type=int,
                     help='Segment length (40=5ms at 8kHZ)')),
        ('--N', dict(default=500, type=int,
                     help='The number of basis signals')),
        ('--hidden_size', dict(default=512, type=int,
                               help='Number of LSTM hidden units')),
        ('--num_layers', dict(default=4, type=int,
                              help='Number of LSTM layers')),
        ('--bidirectional', dict(default=0, type=int,
                                 help='Whether use bidirectional LSTM')),
        ('--nspk', dict(default=2, type=int,
                        help='Number of speaker')),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli


parser = _build_parser()
-
-
def preprocess_one_dir(in_dir, out_dir, out_filename, sample_rate=8000):
    """Index the wav files of one directory into a JSON file.

    Every entry of ``in_dir`` whose name ends with ``.wav`` is loaded with
    librosa at ``sample_rate`` and recorded as a ``[path, num_samples]`` pair.
    The list is written to ``<out_dir>/<out_filename>.json``.

    Args:
        in_dir: directory to scan (only its direct entries are considered).
        out_dir: directory receiving the JSON file; created when missing.
        out_filename: name of the JSON file without the ``.json`` extension.
        sample_rate: sampling rate handed to ``librosa.load`` (default 8000).
    """
    in_dir = os.path.abspath(in_dir)
    file_infos = []
    # os.listdir returns names in arbitrary order; sorting makes the index
    # reproducible across runs and across the directories indexed this way.
    for wav_file in sorted(os.listdir(in_dir)):
        if not wav_file.endswith('.wav'):
            continue
        wav_path = os.path.join(in_dir, wav_file)
        # The audio is decoded only to learn its length in samples.
        samples, _ = librosa.load(wav_path, sr=sample_rate)
        file_infos.append((wav_path, len(samples)))
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, out_filename + '.json'), 'w') as f:
        json.dump(file_infos, f, indent=4)
-
-
def preprocess(args):
    """Write the JSON indexes of the ``test`` split.

    For each of the ``mix``, ``s1`` and ``s2`` sub-directories of
    ``<args.in_dir>/test``, ``preprocess_one_dir`` produces
    ``<args.out_dir>/test/<name>.json``.

    Args:
        args: namespace providing ``in_dir``, ``out_dir`` and ``sample_rate``.
    """
    print("Begin preprocess")
    splits = ('test',)
    stems = ('mix', 's1', 's2')
    for split in splits:
        json_dir = os.path.join(args.out_dir, split)
        for stem in stems:
            wav_dir = os.path.join(args.in_dir, split, stem)
            preprocess_one_dir(wav_dir, json_dir, stem,
                               sample_rate=args.sample_rate)
    print("Preprocess done")
-
-
def evaluate(args):
    """Score a TasNet checkpoint on the test set and print the results.

    Steps: copy the dataset from OBS into the container, build the JSON
    indexes, restore the checkpoint, run the network over every batch, and
    print per-utterance and average SI-SNRi (plus SDRi when ``args.cal_sdr``
    is non-zero).

    Args:
        args: parsed command-line namespace. ``args.data_url`` and
            ``args.train_url`` are overwritten with fixed container paths.
    """
    total_SISNRi = 0
    total_SDRi = 0
    total_cnt = 0

    print("Start copying data!!")
    # ---- Copy the dataset from OBS into the training image (boilerplate) ----
    # Inside the training environment data_url and train_url are redefined,
    # and the data is copied from OBS to the corresponding fixed path.
    obs_data_url = args.data_url
    args.data_url = '/home/work/user-job-dir/inputs/data/'
    args.train_url = '/home/work/user-job-dir/outputs/model/'
    try:
        mox.file.copy_parallel(obs_data_url, args.data_url)
        print("Successfully Download {} to {}".format(obs_data_url,
                                                      args.data_url))
    except Exception as e:
        # Kept best-effort as before: the failure is reported and evaluation
        # continues with whatever is already present at args.data_url.
        print('moxing download {} to {} failed: '.format(
            obs_data_url, args.data_url) + str(e))

    preprocess(args)

    set_seed(1)

    # Load model
    model = TasNet(args.L, args.N, args.hidden_size, args.num_layers,
                   bidirectional=bool(args.bidirectional), nspk=args.nspk)
    model.set_train(mode=False)
    # The checkpoint path is resolved relative to this script's directory.
    home = os.path.dirname(os.path.realpath(__file__))
    ckpt = os.path.join(home, args.model_path)
    print('=====> load params into generator')
    params = load_checkpoint(ckpt)
    load_param_into_net(model, params)
    print('=====> finish load generator')
    print(model)

    # Load data
    tt_dataset = DatasetGenerator(args.data_dir, args.batch_size,
                                  sample_rate=args.sample_rate, L=args.L)
    tt_loader = ds.GeneratorDataset(tt_dataset, ["mixture", "lens", "sources"], shuffle=False)
    tt_loader = tt_loader.batch(batch_size=args.batch_size)

    # Loop-invariant objects are created once instead of once per batch.
    # The import stays function-local, as it was originally.
    # NOTE(review): assumes Loss keeps no per-batch state - confirm in Loss.py.
    from Loss import Loss
    my_loss = Loss()
    cast = ops.Cast()

    for data in tt_loader.create_dict_iterator():
        padded_mixture = cast(data["mixture"], mindspore.float32)
        padded_source = cast(data["sources"], mindspore.float32)
        mixture_lengths = data["lens"]
        estimate_source = model(padded_mixture)
        # Only the re-ordered estimate (4th output of the loss) is needed.
        _, _, _, reorder_estimate_source = \
            my_loss(padded_source, estimate_source, mixture_lengths)
        # Remove padding and flat
        mixture = remove_pad_and_flat(padded_mixture)
        source = remove_pad_and_flat(padded_source)
        # NOTE: use reorder estimate source
        estimate_source = remove_pad_and_flat(reorder_estimate_source)
        for mix, src_ref, src_est in zip(mixture, source, estimate_source):
            print("Utt", total_cnt + 1)
            # Compute SDRi
            if args.cal_sdr:
                avg_SDRi = cal_SDRi(src_ref, src_est, mix)
                total_SDRi += avg_SDRi
                print("\tSDRi={0:.2f}".format(avg_SDRi))
            # Compute SI-SNRi
            avg_SISNRi = cal_SISNRi(src_ref, src_est, mix)
            print("\tSI-SNRi={0:.2f}".format(avg_SISNRi))
            total_SISNRi += avg_SISNRi
            total_cnt += 1

    if total_cnt == 0:
        # Without this guard the averages below divide by zero.
        print("No utterance was evaluated; nothing to average")
        return
    if args.cal_sdr:
        print("Average SDR improvement: {0:.2f}".format(total_SDRi / total_cnt))
    print("Average SISNR improvement: {0:.2f}".format(total_SISNRi / total_cnt))
-
-
def cal_SDRi(src_ref, src_est, mix):
    """Calculate Source-to-Distortion Ratio improvement (SDRi).

    The improvement of each source is the SDR of the estimate minus the SDR
    obtained when the unprocessed mixture is used as the estimate. Works for
    any number of sources C (the previous version assumed exactly two).

    NOTE: bss_eval_sources is very very slow.

    Args:
        src_ref: numpy.ndarray, [C, T]
        src_est: numpy.ndarray, [C, T], reordered by best PIT permutation
        mix: numpy.ndarray, [T]
    Returns:
        average_SDRi: mean SDR improvement over the C sources
    """
    num_src = len(src_ref)
    # Baseline: the mixture repeated once per reference source.
    src_anchor = np.stack([mix] * num_src, axis=0)
    sdr, _, _, _ = bss_eval_sources(src_ref, src_est)
    sdr0, _, _, _ = bss_eval_sources(src_ref, src_anchor)
    avg_SDRi = np.mean(sdr - sdr0)
    return avg_SDRi
-
-
def cal_SISNRi(src_ref, src_est, mix):
    """Calculate Scale-Invariant Source-to-Noise Ratio improvement (SI-SNRi).

    For every source the improvement is the SI-SNR of the estimate minus the
    SI-SNR of the raw mixture, both measured against the same reference.
    Works for any number of sources C (the previous version only looked at
    the first two rows).

    Args:
        src_ref: numpy.ndarray, [C, T]
        src_est: numpy.ndarray, [C, T], reordered by best PIT permutation
        mix: numpy.ndarray, [T]
    Returns:
        average_SISNRi: mean SI-SNR improvement over the paired sources
    """
    gains = [
        cal_SISNR(ref, est) - cal_SISNR(ref, mix)
        for ref, est in zip(src_ref, src_est)
    ]
    avg_SISNRi = sum(gains) / len(gains)
    return avg_SISNRi
-
-
def cal_SISNR(ref_sig, out_sig, eps=1e-8):
    """Calculate Scale-Invariant Source-to-Noise Ratio (SI-SNR) in dB.

    Both signals are mean-removed, the output is projected onto the
    reference, and the energy ratio between that projection and the
    remaining residual is converted to decibels.

    Args:
        ref_sig: numpy.ndarray, [T]
        out_sig: numpy.ndarray, [T]
        eps: small constant added to the energies and to the ratio to avoid
            division by zero and log(0)
    Returns:
        SISNR
    """
    assert len(ref_sig) == len(out_sig)
    target = ref_sig - np.mean(ref_sig)
    estimate = out_sig - np.mean(out_sig)

    target_energy = np.sum(target ** 2) + eps
    correlation = np.sum(target * estimate)
    # Component of the estimate that lies along the reference.
    projection = correlation * target / target_energy
    residual = estimate - projection

    signal_power = np.sum(projection ** 2)
    residual_power = np.sum(residual ** 2) + eps
    ratio = signal_power / residual_power
    # 10 * log10(.) written with natural logarithms.
    return 10 * np.log(ratio + eps) / np.log(10.0)
-
-
def remove_pad_and_flat(inputs, max_segments=3320):
    """Truncate each batch item along K and flatten it to a numpy array.

    Args:
        inputs: Tensor, [B, C, K, L] or [B, K, L]
        max_segments: at most this many entries are kept along the K axis
            before flattening. Defaults to 3320, the value that used to be
            hard-coded here.
            NOTE(review): presumably the fixed padded length produced by the
            dataset - confirm against the data pipeline.
    Returns:
        results: a list containing B items; each item is a numpy array of
            shape [C, T] for 4-D input or [T] for 3-D input
    Raises:
        ValueError: if ``inputs`` is neither 3-D nor 4-D (previously an
            empty list was returned silently).
    """
    dim = inputs.ndim
    if dim == 4:  # [B, C, K, L]
        C = inputs.shape[1]
        return [item[:, :max_segments].view(C, -1).asnumpy() for item in inputs]
    if dim == 3:  # [B, K, L]
        return [item[:max_segments].view(-1).asnumpy() for item in inputs]
    raise ValueError(
        "inputs must be 3-D [B, K, L] or 4-D [B, C, K, L], got {} dims".format(dim))
-
-
if __name__ == '__main__':
    # Separator line: the "*+*+" pattern repeated 100 times.
    print("*+*+" * 100)

    # Parse the command line and echo the effective configuration.
    args = parser.parse_args()
    print(args)

    # PyNative mode on the requested device, then run the evaluation.
    run_mode = context.PYNATIVE_MODE
    context.set_context(mode=run_mode, device_target=args.device_target)
    evaluate(args)
|