|
###################### Compress the augmented-data output files into a zip archive ######################
import zipfile
import os

# Directory whose files will be archived.
dir_path = "/model/B/C"

# Collect regular files only: os.listdir also returns sub-directories, which
# ZipFile.write would add as useless empty entries (it does not recurse).
file_paths = [
    os.path.join(dir_path, f)
    for f in os.listdir(dir_path)
    if os.path.isfile(os.path.join(dir_path, f))
]

# The context manager guarantees the archive is closed (and its central
# directory flushed) even if an error occurs mid-way; the original only
# closed it on the happy path.
with zipfile.ZipFile("/model/logmel_val.zip", "w") as zip_file:
    for file in file_paths:
        # arcname strips the absolute /model/B/C prefix so the archive
        # contains bare file names instead of machine-specific paths.
        zip_file.write(file, arcname=os.path.basename(file))

print("################# success #################")
-
#============================ Compute logmel features after adding random noise ============================#
import os
import numpy as np
import scipy.io
import pandas as pd
import librosa
import pickle
import soundfile as sound
from multiprocessing import Pool


# Dataset root, training-split csv, and output directory for the noisy features.
data_path = '/dataset/TAU-urban-acoustic-scenes-2020-mobile-development/'
csv_file = data_path + 'evaluation_setup/fold1_train.csv'
output_path = '/model/Chen/features/logmel128_scaled_noise'
feature_type = 'logmel'

# Log-mel extraction parameters.
sr = 44100
duration = 10
num_freq_bin = 40
num_fft = 1766
hop_length = int(num_fft/2)
num_time_bin = int(np.ceil(duration*sr/hop_length))
num_channel = 1

if not os.path.exists(output_path):
    os.makedirs(output_path)

data_df = pd.read_csv(csv_file, sep='\t', encoding='ASCII')
wavpath = data_df['filename'].tolist()


for i in range(len(wavpath)):
    stereo, fs = sound.read(data_path + wavpath[i], stop=duration*sr)
    # Additive Gaussian noise scaled by 0.01; exactly-zero (silent) samples
    # are deliberately left at zero.
    noise = np.random.normal(0, 1, len(stereo))
    stereo = np.where(stereo != 0.0, stereo.astype('float64') + 0.01 * noise, 0.0).astype(np.float32)

    logmel_data = np.zeros((num_freq_bin, num_time_bin, num_channel), 'float32')
    logmel_data[:, :, 0] = librosa.feature.melspectrogram(y=stereo,
                                                          sr=sr,
                                                          n_fft=num_fft,
                                                          hop_length=hop_length,
                                                          n_mels=num_freq_bin,
                                                          fmin=0.0,
                                                          fmax=sr/2,
                                                          htk=True,
                                                          norm=None)

    logmel_data = np.log(logmel_data+1e-8)

    # Standardise each mel band over time; a constant band yields NaN (0/0),
    # which the final line zeroes out (matching the original behaviour).
    for j in range(logmel_data.shape[0]):
        mean = np.mean(logmel_data[j, :, 0])
        std = np.std(logmel_data[j, :, 0])
        logmel_data[j, :, 0] = (logmel_data[j, :, 0] - mean) / std
    logmel_data[np.isnan(logmel_data)] = 0.

    feature_data = {'feat_data': logmel_data}

    # wavpath entries look like 'audio/<clip>.wav'; [5:-4] keeps '/<clip>'.
    cur_file_name = output_path + wavpath[i][5:-4] + '-noise.' + feature_type
    # 'with' closes the pickle file; the original leaked one handle per clip.
    with open(cur_file_name, 'wb') as fh:
        pickle.dump(feature_data, fh, protocol=pickle.HIGHEST_PROTOCOL)
    if i % 500 == 499:
        print("%i/%i samples done" % (i+1, len(wavpath)))
-
-
-
#============================ Compute logmel features after pitch shifting ============================#
import os
import numpy as np
import scipy.io
import pandas as pd
import librosa
import pickle
import soundfile as sound
from multiprocessing import Pool


# Dataset root, training-split csv, and output directory for pitch-shifted features.
data_path = '/dataset/TAU-urban-acoustic-scenes-2020-mobile-development/'
csv_file = data_path + 'evaluation_setup/fold1_train.csv'
output_path = '/model/B/C'
feature_type = 'logmel'

# Log-mel extraction parameters.
sr = 44100
duration = 10
num_freq_bin = 40
num_fft = 1766
hop_length = int(num_fft/2)
num_time_bin = int(np.ceil(duration*sr/hop_length))
num_channel = 1

if not os.path.exists(output_path):
    os.makedirs(output_path)

data_df = pd.read_csv(csv_file, sep='\t', encoding='ASCII')
wavpath = data_df['filename'].tolist()

for i in range(len(wavpath)):
    stereo, fs = sound.read(data_path + wavpath[i], stop=duration*sr)
    # Random pitch shift of up to +/-4 semitones.  librosa >= 0.10 removed
    # the positional form pitch_shift(y, sr, ...); the keyword form below is
    # accepted by both old and new versions.
    n_step = np.random.uniform(-4, 4)
    stereo = librosa.effects.pitch_shift(y=stereo, sr=sr, n_steps=n_step)

    logmel_data = np.zeros((num_freq_bin, num_time_bin, num_channel), 'float32')
    logmel_data[:, :, 0] = librosa.feature.melspectrogram(y=stereo,
                                                          sr=sr,
                                                          n_fft=num_fft,
                                                          hop_length=hop_length,
                                                          n_mels=num_freq_bin,
                                                          fmin=0.0,
                                                          fmax=sr/2,
                                                          htk=True,
                                                          norm=None)

    logmel_data = np.log(logmel_data+1e-8)

    # Standardise each mel band over time; constant bands give NaN (0/0)
    # and are zeroed below (matching the original behaviour).
    for j in range(logmel_data.shape[0]):
        mean = np.mean(logmel_data[j, :, 0])
        std = np.std(logmel_data[j, :, 0])
        logmel_data[j, :, 0] = (logmel_data[j, :, 0] - mean) / std
    logmel_data[np.isnan(logmel_data)] = 0.

    feature_data = {'feat_data': logmel_data}

    # wavpath entries look like 'audio/<clip>.wav'; [5:-4] keeps '/<clip>'.
    cur_file_name = output_path + wavpath[i][5:-4] + '-pitch.' + feature_type
    # 'with' closes the pickle file; the original leaked one handle per clip.
    with open(cur_file_name, 'wb') as fh:
        pickle.dump(feature_data, fh, protocol=pickle.HIGHEST_PROTOCOL)
    if i % 500 == 499:
        print("%i/%i samples done" % (i+1, len(wavpath)))
-
-
#============================ Compute logmel features after changing playback speed ============================#
import os
import numpy as np
import scipy.io
import pandas as pd
import librosa
import pickle
import soundfile as sound
from multiprocessing import Pool



# Dataset root, training-split csv, and output directory for speed-changed features.
data_path = '/dataset/TAU-urban-acoustic-scenes-2020-mobile-development/'
csv_file = data_path + 'evaluation_setup/fold1_train.csv'
output_path = '/model/Chen/features/logmel128_scaled_speed'
feature_type = 'logmel'

# Log-mel extraction parameters.
sr = 44100
duration = 10
num_freq_bin = 40
num_fft = 1766
hop_length = int(num_fft/2)
num_time_bin = int(np.ceil(duration*sr/hop_length))
num_channel = 1

if not os.path.exists(output_path):
    os.makedirs(output_path)

data_df = pd.read_csv(csv_file, sep='\t', encoding='ASCII')
wavpath = data_df['filename'].tolist()


for i in range(len(wavpath)):
    stereo, fs = sound.read(data_path + wavpath[i], stop=duration*sr)
    # Random time stretch in [0.5x, 2x].  librosa >= 0.10 requires `rate`
    # as a keyword argument; the keyword form also works on older versions.
    time_factor = np.random.uniform(0.5, 2)
    length = len(stereo)
    y_stretch = librosa.effects.time_stretch(stereo, rate=time_factor)
    # Speeding up (rate > 1) shortens the clip; since rate <= 2 a single
    # self-concatenation is always enough to restore the original length.
    if len(y_stretch) < length:
        y_stretch = np.concatenate((y_stretch, y_stretch))
    stereo = y_stretch[:length]

    logmel_data = np.zeros((num_freq_bin, num_time_bin, num_channel), 'float32')
    logmel_data[:, :, 0] = librosa.feature.melspectrogram(y=stereo,
                                                          sr=sr,
                                                          n_fft=num_fft,
                                                          hop_length=hop_length,
                                                          n_mels=num_freq_bin,
                                                          fmin=0.0,
                                                          fmax=sr/2,
                                                          htk=True,
                                                          norm=None)

    logmel_data = np.log(logmel_data+1e-8)

    # Standardise each mel band over time; constant bands give NaN (0/0)
    # and are zeroed below (matching the original behaviour).
    for j in range(logmel_data.shape[0]):
        mean = np.mean(logmel_data[j, :, 0])
        std = np.std(logmel_data[j, :, 0])
        logmel_data[j, :, 0] = (logmel_data[j, :, 0] - mean) / std
    logmel_data[np.isnan(logmel_data)] = 0.

    feature_data = {'feat_data': logmel_data}

    # wavpath entries look like 'audio/<clip>.wav'; [5:-4] keeps '/<clip>'.
    cur_file_name = output_path + wavpath[i][5:-4] + '-speed.' + feature_type
    # 'with' closes the pickle file; the original leaked one handle per clip.
    with open(cur_file_name, 'wb') as fh:
        pickle.dump(feature_data, fh, protocol=pickle.HIGHEST_PROTOCOL)
    if i % 500 == 499:
        print("%i/%i samples done" % (i+1, len(wavpath)))
-
-
-
#============================ Compute logmel features after mixing audio files from the same class ============================#
import os
import numpy as np
import scipy.io
import pandas as pd
import librosa
import pickle
import soundfile as sound
from multiprocessing import Pool
from itertools import islice
import random


# Dataset root, training-split csv, and output directory for the mixed features.
data_path = '/dataset/TAU-urban-acoustic-scenes-2020-mobile-development/'
csv_file = data_path + 'evaluation_setup/fold1_train.csv'
output_path = '/model/B/C'
feature_type = 'logmel'
folder_name = data_path + "audio/"

# Log-mel extraction parameters (same values as the other augmentation sections).
sr = 44100
duration = 10
num_freq_bin = 40
num_fft = 1766
hop_length = int(num_fft/2)
num_time_bin = int(np.ceil(duration*sr/hop_length))
num_channel = 1

if not os.path.exists(output_path):
    os.makedirs(output_path)

data_df = pd.read_csv(csv_file, sep='\t', encoding='ASCII')
wavpath = data_df['filename'].tolist()
# Scene label -> class index for the 10 TAU-2020 acoustic scene classes.
label_dict = dict(airport=0, bus=1, metro=2, metro_station=3, park=4, public_square=5, shopping_mall=6, street_pedestrian=7, street_traffic=8, tram=9)
-
def class_sort():
    """Group the training file names by scene class.

    Reads the module-level ``csv_file`` (tab-separated, header row skipped)
    and returns a list of 10 lists: entry ``k`` holds the bare file names
    (directory prefix removed) whose scene label maps to index ``k`` in
    ``label_dict``.
    """
    grouped = [[] for _ in range(10)]
    with open(csv_file, 'r') as csv_r:
        # islice(..., 1, None) skips the header line.
        for line in islice(csv_r, 1, None):
            fields = line.split('\t')
            file_name = fields[0].split('/')[1]
            label = fields[1].split('\n')[0]
            grouped[label_dict[label]].append(file_name)

    return grouped
-
sample_rate = 44100
class_list = class_sort()
for label in class_list:
    length = len(label)
    print(length)
    for file in label:
        # A class with fewer than two clips has no mixing partner; the
        # original re-draw loop below would spin forever in that case.
        if length < 2:
            continue
        # Note: the original did `y, sr = librosa.load(...)`, silently
        # clobbering the module-level `sr` constant; the returned rate
        # always equals sample_rate here, so discarding it is equivalent.
        y, _ = librosa.load(folder_name + file, mono=True, sr=sample_rate)
        # Draw a *different* clip from the same class.
        num = random.randint(0, length - 1)
        while file == label[num]:
            num = random.randint(0, length - 1)
        # Random mixing gains for the two clips.
        f1, f2 = random.uniform(0.5, 1), random.uniform(0.5, 1)
        y2, _ = librosa.load(folder_name + label[num], mono=True, sr=sample_rate)
        # NOTE(review): assumes both clips decode to the same length (all
        # TAU clips are 10 s) -- otherwise this addition would raise.
        stereo = y * f1 + y2 * f2

        logmel_data = np.zeros((num_freq_bin, num_time_bin, num_channel), 'float32')
        logmel_data[:, :, 0] = librosa.feature.melspectrogram(y=stereo,
                                                              sr=sample_rate,
                                                              n_fft=num_fft,
                                                              hop_length=hop_length,
                                                              n_mels=num_freq_bin,
                                                              fmin=0.0,
                                                              fmax=sample_rate/2,
                                                              htk=True,
                                                              norm=None)

        logmel_data = np.log(logmel_data+1e-8)

        # Standardise each mel band over time; constant bands give NaN (0/0)
        # and are zeroed below (matching the original behaviour).
        for j in range(logmel_data.shape[0]):
            mean = np.mean(logmel_data[j, :, 0])
            std = np.std(logmel_data[j, :, 0])
            logmel_data[j, :, 0] = (logmel_data[j, :, 0] - mean) / std
        logmel_data[np.isnan(logmel_data)] = 0.

        feature_data = {'feat_data': logmel_data}

        cur_file_name = output_path + '/' + file.split('.')[0] + '-mix.' + feature_type

        # 'with' closes the pickle file; the original leaked one handle per clip.
        with open(cur_file_name, 'wb') as fh:
            pickle.dump(feature_data, fh, protocol=pickle.HIGHEST_PROTOCOL)
    print(label)
-
-
#============================ Compute logmel features of the original (un-augmented) data ============================#
import os
import numpy as np
import scipy.io
import pandas as pd
import librosa
import pickle
import soundfile as sound
from multiprocessing import Pool

# Dataset root, evaluation-split csv, and output directory.
data_path = '/dataset/TAU-urban-acoustic-scenes-2020-mobile-development/'
csv_file = data_path + 'evaluation_setup/fold1_evaluate.csv'
output_path = '/model/B/C'
feature_type = 'logmel'

# Log-mel extraction parameters.
sr = 44100
duration = 10
num_freq_bin = 40
num_fft = 1766
hop_length = int(num_fft / 2)
num_time_bin = int(np.ceil(duration*sr/hop_length))
num_channel = 1

if not os.path.exists(output_path):
    os.makedirs(output_path)

data_df = pd.read_csv(csv_file, sep='\t', encoding='ASCII')
wavpath = data_df['filename'].tolist()

for i in range(len(wavpath)):
    stereo, fs = sound.read(data_path + wavpath[i], stop=duration*sr)
    logmel_data = np.zeros((num_freq_bin, num_time_bin, num_channel), 'float32')
    logmel_data[:, :, 0] = librosa.feature.melspectrogram(y=stereo,
                                                          sr=sr,
                                                          n_fft=num_fft,
                                                          hop_length=hop_length,
                                                          n_mels=num_freq_bin,
                                                          fmin=0.0,
                                                          fmax=sr/2,
                                                          htk=True,
                                                          norm=None)

    logmel_data = np.log(logmel_data+1e-8)

    # Standardise each mel band over time; constant bands give NaN (0/0)
    # and are zeroed below (matching the original behaviour).
    for j in range(logmel_data.shape[0]):
        mean = np.mean(logmel_data[j, :, 0])
        std = np.std(logmel_data[j, :, 0])
        logmel_data[j, :, 0] = (logmel_data[j, :, 0] - mean) / std
    logmel_data[np.isnan(logmel_data)] = 0.

    feature_data = {'feat_data': logmel_data}

    # wavpath entries look like 'audio/<clip>.wav'; [5:-3] keeps '/<clip>.'
    # (trailing dot included), so appending feature_type yields '<clip>.logmel'.
    cur_file_name = output_path + wavpath[i][5:-3] + feature_type
    # 'with' closes the pickle file; the original leaked one handle per clip.
    with open(cur_file_name, 'wb') as fh:
        pickle.dump(feature_data, fh, protocol=pickle.HIGHEST_PROTOCOL)
    if i % 500 == 499:
        print("%i/%i samples done" % (i+1, len(wavpath)))
-
-
#============================ Compute logmel features after spectrum correction ============================#
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
import numpy as np
import pandas as pd
import librosa
import soundfile as sound
import random
from sklearn.preprocessing import StandardScaler

import pickle


# NOTE(review): this path repeats the dataset folder name twice -- confirm
# the nested directory layout is intentional.
file_path = '/dataset/TAU-urban-acoustic-scenes-2020-mobile-development/TAU-urban-acoustic-scenes-2020-mobile-development/'
csv_file = file_path + 'evaluation_setup/fold1_train.csv'

# CSV listing only the device-'a' recordings that will receive correction.
# device_a_csv = file_path + 'evaluation_setup/fold1_train_a.csv'
device_a_csv = '/dataset/speccorr/fold1_train_a.csv'

output_path = '/model/B/C'
feature_type = 'logmel'

if not os.path.exists(output_path):
    os.makedirs(output_path)


# Log-mel extraction parameters (same values as the other sections).
sr = 44100
num_audio_channels = 1
num_channel = 1
duration = 10

num_freq_bin = 40
num_fft = 1766
hop_length = int(num_fft / 2)
num_time_bin = int(np.ceil(duration * sr / hop_length))

dev_train_df = pd.read_csv(csv_file,sep='\t', encoding='ASCII')
wavpaths_train = dev_train_df['filename'].tolist()
-
-
# Strip the leading 'audio/' and trailing '.wav' from each training path.
trainf = [x[6:-4] for x in wavpaths_train]
train_subset = []

# Per-device recording stems: '-a/-b/-c' suffixes are one character,
# '-s1/-s2/-s3' are two; the trailing '-' is kept on every stem.
for idx in ['-a', '-b', '-c']:
    train_subset.append([x[:-1] for x in trainf if (x.endswith(idx))])

for idx in ['-s1', '-s2', '-s3']:
    train_subset.append([x[:-2] for x in trainf if (x.endswith(idx))])


train_sets = [set(subset) for subset in train_subset]

paired_wavs = []
# Stems recorded by both device 'a' and each other device (b, c, s1, s2, s3);
# s4-s6 do not appear in the training partition.
for j in range(1, len(train_sets)):
    paired_wavs.append(train_sets[0] & train_sets[j])

num_paired_wav = [len(x) for x in paired_wavs]
# Use up to 150 pairs per device, but never more than the smallest pairing
# actually available: the original hard-coded 150, and random.sample would
# raise ValueError whenever fewer than 150 pairs existed for some device.
min_paired_wav = min(150, min(num_paired_wav))

waves30 = []
wav_idxs = random.sample(range(min(num_paired_wav)), min_paired_wav)
for wavs in paired_wavs:
    # Hoisted: the original rebuilt list(wavs) once per sampled index (O(n^2)).
    wav_list = list(wavs)
    waves30.append([wav_list[i] for i in wav_idxs])
-
-
nbins_stft = int(np.ceil(num_fft/2.0)+1)
# One row of per-bin magnitude-ratio coefficients per (device, sampled clip).
STFT_all = np.zeros((len(waves30)*min_paired_wav, nbins_stft, num_time_bin), 'float32')

# BUG FIX: the row index must run continuously across ALL device groups.
# The original reset i = 0 inside the group loop, so every group overwrote
# rows 0..min_paired_wav-1 and the remaining rows of STFT_all stayed zero,
# dragging the averaged correction coefficients toward zero.
i = 0
for group, x in zip(waves30, ['b', 'c', 's1', 's2', 's3']):
    for sc in group:
        wav_a = 'audio/' + sc + 'a.wav'
        wav_x = 'audio/' + sc + x + '.wav'
        stereo_a, fs = sound.read(file_path + wav_a, stop=duration * sr)
        stereo_x, fs = sound.read(file_path + wav_x, stop=duration * sr)
        # STFTs of the paired recordings (same scene, devices a and x).
        STFT_a = librosa.stft(stereo_a, n_fft=num_fft, hop_length=hop_length)
        STFT_x = librosa.stft(stereo_x, n_fft=num_fft, hop_length=hop_length)
        # Per-bin magnitude ratio: how device x deviates from reference a.
        STFT_ref = np.abs(STFT_x)
        STFT_corr_coeff = STFT_ref/np.abs(STFT_a)
        # Stack the ratios for later averaging.
        STFT_all[i, :, :] = STFT_corr_coeff
        i = i + 1
-
-
-
# Average the magnitude ratios over all frames and all pairs, yielding one
# correction factor per STFT bin (shape: nbins_stft x 1, broadcastable).
STFT_hstak = np.hstack(STFT_all)
STFT_corr_coeff = np.expand_dims(np.mean(STFT_hstak, axis=1), -1)

data_df = pd.read_csv(device_a_csv, sep='\t', encoding='ASCII')
wavpath = data_df['filename'].tolist()


# The original wrapped this loop in `for d in range(1)`, a single-iteration
# loop whose index was never used; it is dropped with behaviour unchanged.
for i in range(len(wavpath)):
    stereo, fs = sound.read(file_path + wavpath[i], stop=duration*sr)
    STFT = librosa.stft(stereo, n_fft=num_fft, hop_length=hop_length)
    # Apply the device correction in the magnitude domain.
    STFT_corr = np.abs(STFT)*STFT_corr_coeff

    logmel_data = np.zeros((num_freq_bin, num_time_bin, num_channel), 'float32')
    # STFT_corr is already non-negative, so squaring directly gives the power
    # spectrogram (the original's extra np.abs was redundant).
    logmel_data[:, :, 0] = librosa.feature.melspectrogram(S=STFT_corr**2, sr=sr, n_fft=num_fft, hop_length=hop_length, n_mels=num_freq_bin, fmin=0.0, fmax=sr/2, htk=True, norm=None)

    logmel_data = np.log(logmel_data+1e-8)

    # Standardise each mel band over time; constant bands give NaN (0/0)
    # and are zeroed below (matching the original behaviour).
    for j in range(logmel_data.shape[0]):
        mean = np.mean(logmel_data[j, :, 0])
        std = np.std(logmel_data[j, :, 0])
        logmel_data[j, :, 0] = (logmel_data[j, :, 0] - mean) / std
    logmel_data[np.isnan(logmel_data)] = 0.

    feature_data = {'feat_data': logmel_data}

    # wavpath entries look like 'audio/<clip>.wav'; [5:-4] keeps '/<clip>'.
    cur_file_name = output_path + wavpath[i][5:-4] + '-speccorr.' + feature_type
    # 'with' closes the pickle file; the original leaked one handle per clip.
    with open(cur_file_name, 'wb') as fh:
        pickle.dump(feature_data, fh, protocol=pickle.HIGHEST_PROTOCOL)

    if i % 200 == 199:
        print("%i/%i samples done" % (i+1, len(wavpath)))
|