import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules.activation import MultiheadAttention

import numpy as np

class LSTMAttention(nn.Module):
    def __init__(self, config, feature_type='acoustic'):
        super().__init__()
        assert feature_type in ['acoustic', 'semantic']
        self.feature_type = feature_type
        if self.feature_type == 'acoustic':
            # Note: dropout only applies between stacked LSTM layers, so it
            # has no effect on this single-layer LSTM.
            self.acoustic_lstm = nn.LSTM(
                config['acoustic']['embedding_dim'],
                config['acoustic']['hidden_dim'], 1, bidirectional=True,
                batch_first=True, dropout=0.5
            )

            # The bidirectional LSTM doubles the feature dimension.
            self.classifier = nn.Linear(
                2*config['acoustic']['hidden_dim'],
                config['classifier']['class_num']
            )

            self.attention = nn.Parameter(
                torch.randn(2*config['acoustic']['hidden_dim'])
            )

        else:
            if config['semantic']['embedding_path'] is not None:
                # Prepend a zero row so index 0 can serve as the padding token.
                semantic_embed = np.load(config['semantic']['embedding_path'])
                semantic_embed = np.concatenate([np.zeros([1, semantic_embed.shape[1]]), semantic_embed], axis=0)
                self.semantic_embed = nn.Embedding.from_pretrained(torch.FloatTensor(semantic_embed), freeze=False)
            else:
                self.semantic_embed = nn.Embedding(config['semantic']['embedding_size']+1, config['semantic']['embedding_dim'])

            self.semantic_lstm = nn.LSTM(
                config['semantic']['embedding_dim'],
                config['semantic']['hidden_dim'], 1, bidirectional=True,
                batch_first=True, dropout=0.5
            )

            self.classifier = nn.Linear(
                2*config['semantic']['hidden_dim'],
                config['classifier']['class_num']
            )

            self.attention = nn.Parameter(
                torch.randn(2*config['semantic']['hidden_dim'])
            )

        self.loss_name = config['loss']['name']

    def forward(
        self,
        acoustic_input,
        acoustic_length,
        semantic_input,
        semantic_length,
        align_input,
    ):
        if self.feature_type == 'acoustic':
            # Encode the acoustic frames with the bidirectional LSTM.
            acoustic_pack = nn.utils.rnn.pack_padded_sequence(
                acoustic_input, acoustic_length.cpu(), batch_first=True, enforce_sorted=False
            )
            acoustic_embed, _ = self.acoustic_lstm(acoustic_pack)
            # total_length keeps the padded length in sync with the mask below.
            acoustic_embed, _ = nn.utils.rnn.pad_packed_sequence(
                acoustic_embed, batch_first=True, total_length=acoustic_input.size(1)
            )  # [B,A,D]

            # Mask the attention scores at padded positions. # [B,A,1]
            attention_mask = torch.arange(
                acoustic_input.size(1))[None, :].repeat(acoustic_input.size(0), 1
            ).to(acoustic_input.device)
            attention_mask = (attention_mask < acoustic_length[:, None].repeat(1, acoustic_input.size(1))).float()[:, :, None]  # [B,A,1]
        else:
            semantic_embed = self.semantic_embed(semantic_input)  # [B,T,C]
            semantic_pack = nn.utils.rnn.pack_padded_sequence(
                semantic_embed, semantic_length.cpu(), batch_first=True, enforce_sorted=False
            )
            semantic_embed, _ = self.semantic_lstm(semantic_pack)
            semantic_embed, _ = nn.utils.rnn.pad_packed_sequence(
                semantic_embed, batch_first=True, total_length=semantic_input.size(1)
            )  # [B,T,D]

            # Mask the attention scores at padded positions. # [B,T,1]
            attention_mask = torch.arange(
                semantic_input.size(1))[None, :].repeat(semantic_input.size(0), 1
            ).to(semantic_input.device)
            attention_mask = (attention_mask < semantic_length[:, None].repeat(1, semantic_input.size(1))).float()[:, :, None]  # [B,T,1]

        if self.loss_name == 'BCE':
            # Scaled dot-product attention against a learned query vector.
            if self.feature_type == 'acoustic':
                attention_score = torch.matmul(acoustic_embed, self.attention[None, :, None].repeat(acoustic_input.size(0), 1, 1))  # [B,A,1]
            else:
                attention_score = torch.matmul(semantic_embed, self.attention[None, :, None].repeat(semantic_input.size(0), 1, 1))  # [B,T,1]

            attention_score = attention_score / np.sqrt(self.attention.size(0))

            attention_score = attention_score*attention_mask - 1e6*(1-attention_mask)
            attention_score = F.softmax(attention_score, dim=1)

            if self.feature_type == 'acoustic':
                acoustic_embed = torch.matmul(attention_score.permute(0, 2, 1), acoustic_embed).squeeze(1)  # [B,D]
                logits = self.classifier(acoustic_embed)
            else:
                semantic_embed = torch.matmul(attention_score.permute(0, 2, 1), semantic_embed).squeeze(1)  # [B,D]
                logits = self.classifier(semantic_embed)

        elif self.loss_name == 'CTC':
            if self.feature_type == 'acoustic':
                logits = self.classifier(acoustic_embed)  # [B,T,Dim]
            else:
                logits = self.classifier(semantic_embed)  # [B,T,Dim]

            logits = F.log_softmax(logits, dim=-1)
            logits = logits * attention_mask

        else:
            raise ValueError('Loss type not supported!')

        return logits
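

# A minimal smoke test for LSTMAttention. The config schema is inferred from
# the keys the class reads; the sizes below are illustrative assumptions only.
def _demo_lstm_attention():
    config = {
        'acoustic': {'embedding_dim': 34, 'hidden_dim': 128},
        'semantic': {'embedding_path': None, 'embedding_size': 1000,
                     'embedding_dim': 300, 'hidden_dim': 128},
        'classifier': {'class_num': 4},
        'loss': {'name': 'BCE'},
    }
    model = LSTMAttention(config, feature_type='acoustic')
    acoustic = torch.randn(2, 50, 34)                # [B,A,feat]
    semantic = torch.zeros(2, 10, dtype=torch.long)  # unused in acoustic mode
    logits = model(acoustic, torch.tensor([50, 30]),
                   semantic, torch.tensor([10, 7]), None)
    assert logits.shape == (2, 4)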


class NeoMHA2(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Acoustic encoder.
        self.acoustic_lstm = nn.LSTM(
            config['acoustic']['embedding_dim'],
            config['acoustic']['hidden_dim'], 1, bidirectional=True,
            batch_first=True, dropout=0.5
        )
        # Semantic encoder.
        if config['semantic']['embedding_path'] is not None:
            # Prepend a zero row so index 0 can serve as the padding token.
            semantic_embed = np.load(config['semantic']['embedding_path'])
            semantic_embed = np.concatenate([np.zeros([1, semantic_embed.shape[1]]), semantic_embed], axis=0)
            self.semantic_embed = nn.Embedding.from_pretrained(torch.FloatTensor(semantic_embed), freeze=False)
        else:
            self.semantic_embed = nn.Embedding(config['semantic']['embedding_size']+1, config['semantic']['embedding_dim'])

        self.semantic_lstm = nn.LSTM(
            config['semantic']['embedding_dim'],
            config['semantic']['hidden_dim'], 1, bidirectional=True,
            batch_first=True, dropout=0.5
        )

        # The cross-modal dot products in forward() require the acoustic and
        # semantic LSTMs to share the same hidden size; the classifier input
        # is three concatenated 2*hidden_dim vectors.
        self.classifier = nn.Linear(
            2*config['semantic']['hidden_dim']+2*2*config['semantic']['hidden_dim'],
            config['classifier']['class_num']
        )

    def forward(
        self,
        acoustic_input,
        acoustic_length,
        semantic_input,
        semantic_length,
        align_input,
    ):
        acoustic_pack = nn.utils.rnn.pack_padded_sequence(
            acoustic_input, acoustic_length.cpu(), batch_first=True, enforce_sorted=False
        )
        acoustic_embed, acoustic_hidden = self.acoustic_lstm(acoustic_pack)
        acoustic_embed, _ = nn.utils.rnn.pad_packed_sequence(acoustic_embed, batch_first=True)  # [B,A,D]
        # Concatenate the final forward/backward hidden states: h_n is
        # [num_directions, B, H], so stack along the feature axis. # [B,2H]
        acoustic_hidden_0 = torch.cat([acoustic_hidden[0][0], acoustic_hidden[0][1]], dim=-1)

        semantic_embed = self.semantic_embed(semantic_input)  # [B,T,C]
        semantic_pack = nn.utils.rnn.pack_padded_sequence(
            semantic_embed, semantic_length.cpu(), batch_first=True, enforce_sorted=False
        )
        semantic_embed, semantic_hidden = self.semantic_lstm(semantic_pack)
        semantic_embed, _ = nn.utils.rnn.pad_packed_sequence(semantic_embed, batch_first=True)  # [B,T,D]

        # First-hop attention: the acoustic summary attends over the semantic sequence.
        a1 = torch.matmul(acoustic_hidden_0[:, None, :], semantic_embed.permute(0, 2, 1))
        a1_mask = torch.arange(semantic_embed.size(1))[None, :].repeat(semantic_embed.size(0), 1).to(semantic_embed.device)
        a1_mask = (a1_mask < semantic_length[:, None].repeat(1, semantic_embed.size(1))).float().unsqueeze(1)
        a1_mask = (1.0 - a1_mask) * -10000.0

        a1 = a1 + a1_mask
        a1 = nn.Softmax(dim=-1)(a1)
        semantic_hidden_0 = torch.matmul(a1, semantic_embed).squeeze(1)

        fuse_hidden_0 = torch.cat([acoustic_hidden_0, semantic_hidden_0], dim=-1)

        # Second-hop attention: the semantic summary attends back over the acoustic sequence.
        a2 = torch.matmul(semantic_hidden_0[:, None, :], acoustic_embed.permute(0, 2, 1))
        a2_mask = torch.arange(acoustic_embed.size(1))[None, :].repeat(acoustic_embed.size(0), 1).to(acoustic_embed.device)
        a2_mask = (a2_mask < acoustic_length[:, None].repeat(1, acoustic_embed.size(1))).float().unsqueeze(1)
        a2_mask = (1.0 - a2_mask) * -10000.0

        a2 = a2 + a2_mask
        a2 = nn.Softmax(dim=-1)(a2)
        acoustic_hidden_1 = torch.matmul(a2, acoustic_embed).squeeze(1)

        fuse_hidden_1 = torch.cat([fuse_hidden_0, acoustic_hidden_1], dim=-1)

        logits = self.classifier(fuse_hidden_1)

        return logits


class NeoExcite(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config['semantic']['embedding_path'] is not None:
            # Prepend a zero row so index 0 can serve as the padding token.
            semantic_embed = np.load(config['semantic']['embedding_path'])
            semantic_embed = np.concatenate([np.zeros([1, semantic_embed.shape[1]]), semantic_embed], axis=0)
            self.semantic_embed = nn.Embedding.from_pretrained(torch.FloatTensor(semantic_embed), freeze=False)
        else:
            self.semantic_embed = nn.Embedding(config['semantic']['embedding_size']+1, config['semantic']['embedding_dim'])

        self.semantic_linear = nn.Linear(config['semantic']['embedding_dim'], config['semantic']['hidden_dim'])
        # CNN encoder for the audio features; each average pool mirrors the
        # receptive field of the matching convolution so the alignment matrix
        # keeps the same time resolution as the features.
        self.acoustic_cnn1 = nn.Conv1d(config['acoustic']['embedding_dim'], 64, 5, 1)
        self.acoustic_cnn2 = nn.Conv1d(64, 128, 2, 1)
        self.acoustic_cnn3 = nn.Conv1d(128, config['acoustic']['hidden_dim'], 2, 1)
        self.acoustic_mean1 = nn.AvgPool2d((1, 5), (1, 1))
        self.acoustic_mean2 = nn.AvgPool2d((1, 2), (1, 1))
        self.acoustic_mean3 = nn.AvgPool2d((1, 2), (1, 1))

        # Fusion LSTM over the concatenated semantic and acoustic features.
        self.fuse_lstm = nn.LSTM(
            config['semantic']['hidden_dim']+config['acoustic']['hidden_dim'],
            config['fusion']['hidden_dim'], 1, bidirectional=True,
            batch_first=True, dropout=0.5
        )

        # Cross-modal excitement layers: each modality gates the other.
        self.acoustic_excit = nn.Embedding(config['semantic']['embedding_size']+1, config['acoustic']['hidden_dim'])
        self.semantic_excit = nn.Linear(config['acoustic']['hidden_dim'], config['semantic']['hidden_dim'])

        self.classifier = nn.Linear(2*config['fusion']['hidden_dim'], config['classifier']['class_num'])

    def forward(
        self,
        acoustic_input,
        acoustic_length,
        semantic_input,
        semantic_length,
        align_input,
    ):
        # First encode the token-level semantic patterns.
        semantic_embed = self.semantic_embed(semantic_input)  # [B,T,C]
        semantic_embed = self.semantic_linear(semantic_embed)
        # Then encode the frame-level acoustic patterns.
        acoustic_embed = self.acoustic_cnn1(acoustic_input.permute(0, 2, 1))
        acoustic_align = self.acoustic_mean1(align_input[:, None, :, :])

        acoustic_embed = self.acoustic_cnn2(acoustic_embed)
        acoustic_align = self.acoustic_mean2(acoustic_align)

        acoustic_embed = self.acoustic_cnn3(acoustic_embed)  # [B,C,A]
        acoustic_align = self.acoustic_mean3(acoustic_align)  # [B,1,T,A]

        # Normalize the alignment weights so each row sums to 1; zero entries
        # are pushed to -1e6 before the softmax so they get no weight.
        acoustic_align = acoustic_align - (acoustic_align == 0).float() * 1e6
        acoustic_align = F.softmax(acoustic_align, dim=3)
        # Aggregate the acoustic features according to the alignment.
        acoustic_embed = torch.matmul(
            torch.squeeze(acoustic_align, 1), acoustic_embed.permute(0, 2, 1))  # [B,T,C]
        # Cross-modal excitement: each modality produces a sigmoid gate for the other.
        acoustic_excit = torch.sigmoid(self.acoustic_excit(semantic_input))
        semantic_excit = torch.sigmoid(self.semantic_excit(acoustic_embed))
        acoustic_embed = acoustic_embed * acoustic_excit
        semantic_embed = semantic_embed * semantic_excit
        fuse_embed = torch.cat([semantic_embed, acoustic_embed], dim=2)
        # Encode the fused multimodal sequence with the fusion LSTM.
        fuse_pack = nn.utils.rnn.pack_padded_sequence(
            fuse_embed, semantic_length.cpu(), batch_first=True, enforce_sorted=False
        )
        fuse_embed, _ = self.fuse_lstm(fuse_pack)
        fuse_embed, _ = nn.utils.rnn.pad_packed_sequence(
            fuse_embed, batch_first=True, total_length=semantic_input.size(1)
        )
        # Max-pool over valid time steps to obtain the utterance representation.
        fuse_mask = torch.arange(
            semantic_input.size(1))[None, :].repeat(semantic_input.size(0), 1
        ).to(semantic_input.device)
        fuse_mask = (fuse_mask < semantic_length[:, None].repeat(1, semantic_input.size(1))).float()
        fuse_embed = fuse_embed - (1 - fuse_mask[:, :, None]) * 1e6
        fuse_embed = torch.max(fuse_embed, dim=1)[0]

        logits = self.classifier(fuse_embed)
        return logits
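

# The cross-modal excitement above reduces to elementwise sigmoid gating.
# A minimal standalone sketch of the same operation (names and sizes here
# are illustrative, not the model's actual dimensions):
def _excite_demo():
    acoustic = torch.randn(2, 10, 8)                 # [B,T,acoustic_dim]
    semantic = torch.randn(2, 10, 6)                 # [B,T,semantic_dim]
    gate = nn.Linear(8, 6)                           # acoustic -> semantic gate
    excited_semantic = semantic * torch.sigmoid(gate(acoustic))
    assert excited_semantic.shape == semantic.shape  # gating preserves shape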


class NeoMeanMaxExcite(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config['semantic']['embedding_path'] is not None:
            # Prepend a zero row so index 0 can serve as the padding token.
            semantic_embed = np.load(config['semantic']['embedding_path'])
            semantic_embed = np.concatenate([np.zeros([1, semantic_embed.shape[1]]), semantic_embed], axis=0)
            self.semantic_embed = nn.Embedding.from_pretrained(torch.FloatTensor(semantic_embed), freeze=False)
        else:
            self.semantic_embed = nn.Embedding(config['semantic']['embedding_size']+1, config['semantic']['embedding_dim'])

        self.semantic_linear = nn.Linear(config['semantic']['embedding_dim'], config['semantic']['hidden_dim'])
        # CNN encoder for the audio features; the last layer emits hidden_dim/2
        # channels because mean- and max-pooled features are concatenated later.
        self.acoustic_cnn1 = nn.Conv1d(config['acoustic']['embedding_dim'], 64, 5, 1)
        self.acoustic_cnn2 = nn.Conv1d(64, 128, 2, 1)
        self.acoustic_cnn3 = nn.Conv1d(128, int(config['acoustic']['hidden_dim']/2), 2, 1)
        self.acoustic_mean1 = nn.AvgPool2d((1, 5), (1, 1))
        self.acoustic_mean2 = nn.AvgPool2d((1, 2), (1, 1))
        self.acoustic_mean3 = nn.AvgPool2d((1, 2), (1, 1))

        # Fusion LSTM over the concatenated semantic and acoustic features.
        self.fuse_lstm = nn.LSTM(
            config['semantic']['hidden_dim']+config['acoustic']['hidden_dim'],
            config['fusion']['hidden_dim'], 1, bidirectional=True,
            batch_first=True, dropout=0.5
        )

        # Optional cross-modal excitement layers.
        if config['acoustic']['excite']:
            self.acoustic_excit = nn.Embedding(config['semantic']['embedding_size']+1, config['acoustic']['hidden_dim'])
        else:
            self.acoustic_excit = None
        if config['semantic']['excite']:
            self.semantic_excit = nn.Linear(config['acoustic']['hidden_dim'], config['semantic']['hidden_dim'])
        else:
            self.semantic_excit = None

        self.loss_name = config['loss']['name']
        # CTC needs one extra output unit for the blank label.
        self.classifier = nn.Linear(
            2*config['fusion']['hidden_dim'],
            config['classifier']['class_num']+1*int(self.loss_name == 'CTC')
        )

    def forward(
        self,
        acoustic_input,
        acoustic_length,
        semantic_input,
        semantic_length,
        align_input,
    ):
        # First encode the token-level semantic patterns.
        semantic_embed = self.semantic_embed(semantic_input)  # [B,T,C]
        semantic_embed = self.semantic_linear(semantic_embed)
        # Then encode the frame-level acoustic patterns.
        acoustic_embed = self.acoustic_cnn1(acoustic_input.permute(0, 2, 1))
        acoustic_align = self.acoustic_mean1(align_input[:, None, :, :])

        acoustic_embed = self.acoustic_cnn2(acoustic_embed)
        acoustic_align = self.acoustic_mean2(acoustic_align)

        acoustic_embed = self.acoustic_cnn3(acoustic_embed)  # [B,C,A]
        acoustic_align = self.acoustic_mean3(acoustic_align)  # [B,1,T,A]

        acoustic_embed = acoustic_embed.permute(0, 2, 1)[:, None, :, :].repeat(1, semantic_embed.size(1), 1, 1)  # [B,T,A,C]
        acoustic_align = (acoustic_align.squeeze(1)[:, :, :, None] > 0).float()  # [B,T,A,1]
        # Masked mean over the frames aligned to each token.
        acoustic_mean = torch.sum(acoustic_embed*acoustic_align, dim=2) / (torch.sum(acoustic_align, dim=2) + 1e-6)  # [B,T,C]
        # Masked max over the frames aligned to each token.
        acoustic_max, _ = torch.max(acoustic_embed*acoustic_align - 1e6*(1.0-acoustic_align), dim=2)
        # Concatenate the mean- and max-pooled embeddings.
        acoustic_embed = torch.cat([acoustic_mean, acoustic_max], dim=-1)

        # Cross-modal excitement: each modality produces a sigmoid gate for the other.
        if self.semantic_excit is not None:
            semantic_excit = torch.sigmoid(self.semantic_excit(acoustic_embed))
            semantic_embed = semantic_embed * semantic_excit
        if self.acoustic_excit is not None:
            acoustic_excit = torch.sigmoid(self.acoustic_excit(semantic_input))
            acoustic_embed = acoustic_embed * acoustic_excit

        fuse_embed = torch.cat([semantic_embed, acoustic_embed], dim=2)
        # Encode the fused multimodal sequence with the fusion LSTM.
        fuse_pack = nn.utils.rnn.pack_padded_sequence(
            fuse_embed, semantic_length.cpu(), batch_first=True, enforce_sorted=False
        )
        fuse_embed, _ = self.fuse_lstm(fuse_pack)
        fuse_embed, _ = nn.utils.rnn.pad_packed_sequence(
            fuse_embed, batch_first=True, total_length=semantic_input.size(1)
        )
        # Build the valid-step mask for pooling/CTC.
        fuse_mask = torch.arange(
            semantic_input.size(1))[None, :].repeat(semantic_input.size(0), 1
        ).to(semantic_input.device)
        fuse_mask = (fuse_mask < semantic_length[:, None].repeat(1, semantic_input.size(1))).float()

        if self.loss_name == 'BCE':
            # Max-pool over valid time steps to obtain the utterance representation.
            fuse_embed = fuse_embed - (1 - fuse_mask[:, :, None]) * 1e6
            fuse_embed = torch.max(fuse_embed, dim=1)[0]
            logits = self.classifier(fuse_embed)

        elif self.loss_name == 'CTC':
            logits = self.classifier(fuse_embed)  # [B,T,Dim]
            logits = F.log_softmax(logits, dim=-1)
            logits = logits * fuse_mask[:, :, None]

        else:
            raise ValueError('Loss type not supported!')

        return logits
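

# The token-level pooling above is just a masked mean/max over aligned frames.
# A minimal sketch on dummy tensors (shapes and names are illustrative only):
def _masked_pool_demo():
    feats = torch.randn(1, 3, 5, 4)                 # [B,T,A,C] frame features per token
    align = (torch.rand(1, 3, 5, 1) > 0.5).float()  # [B,T,A,1] frame-to-token alignment
    mean = (feats*align).sum(2) / (align.sum(2) + 1e-6)    # masked mean, [B,T,C]
    mx = (feats*align - 1e6*(1.0-align)).max(2)[0]         # masked max, [B,T,C]
    pooled = torch.cat([mean, mx], dim=-1)                 # [B,T,2C]
    assert pooled.shape == (1, 3, 8)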


class NeoDiDi(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config['semantic']['embedding_path'] is not None:
            # Prepend a zero row so index 0 can serve as the padding token.
            semantic_embed = np.load(config['semantic']['embedding_path'])
            semantic_embed = np.concatenate([np.zeros([1, semantic_embed.shape[1]]), semantic_embed], axis=0)
            self.semantic_embed = nn.Embedding.from_pretrained(torch.FloatTensor(semantic_embed), freeze=False)
        else:
            self.semantic_embed = nn.Embedding(config['semantic']['embedding_size']+1, config['semantic']['embedding_dim'])

        self.semantic_lstm = nn.LSTM(
            config['semantic']['embedding_dim'],
            config['semantic']['hidden_dim'], 1, bidirectional=True,
            batch_first=True, dropout=0.5
        )

        self.acoustic_lstm = nn.LSTM(
            config['acoustic']['embedding_dim'],
            config['acoustic']['hidden_dim'], 1, bidirectional=True,
            batch_first=True, dropout=0.5
        )

        self.fuse_lstm = nn.LSTM(
            config['semantic']['hidden_dim']*2+config['acoustic']['hidden_dim']*2,
            config['fusion']['hidden_dim'], 1, bidirectional=True,
            batch_first=True, dropout=0.5
        )

        # Cross-modal attention that aligns acoustic frames to semantic tokens.
        # Note: this assumes the acoustic and semantic LSTMs share the same
        # hidden size, since the keys/values come from the acoustic encoder.
        self.align_attention = MultiheadAttention(
            2*config['semantic']['hidden_dim'],
            config['fusion']['num_heads'], dropout=0.5
        )

        self.classifier = nn.Linear(
            2*config['fusion']['hidden_dim'],
            config['classifier']['class_num']
        )

        self.loss_name = config['loss']['name']

    def forward(
        self,
        acoustic_input,
        acoustic_length,
        semantic_input,
        semantic_length,
        align_input,
    ):
        # First encode the token-level semantic patterns.
        semantic_embed = self.semantic_embed(semantic_input)  # [B,T,C]
        semantic_pack = nn.utils.rnn.pack_padded_sequence(
            semantic_embed, semantic_length.cpu(), batch_first=True, enforce_sorted=False
        )
        semantic_embed, _ = self.semantic_lstm(semantic_pack)
        semantic_embed, _ = nn.utils.rnn.pad_packed_sequence(
            semantic_embed, batch_first=True, total_length=semantic_input.size(1)
        )
        # Then encode the frame-level acoustic patterns.
        acoustic_pack = nn.utils.rnn.pack_padded_sequence(
            acoustic_input, acoustic_length.cpu(), batch_first=True, enforce_sorted=False
        )
        acoustic_embed, _ = self.acoustic_lstm(acoustic_pack)
        acoustic_embed, _ = nn.utils.rnn.pad_packed_sequence(acoustic_embed, batch_first=True)

        # Cross-modal attention: semantic tokens query the acoustic sequence.
        # MultiheadAttention expects [T,B,D]; padded acoustic frames are
        # excluded via key_padding_mask.
        acoustic_pad_mask = torch.arange(
            acoustic_embed.size(1), device=acoustic_embed.device
        )[None, :] >= acoustic_length[:, None].to(acoustic_embed.device)
        acoustic_embed, _ = self.align_attention(
            semantic_embed.permute(1, 0, 2), acoustic_embed.permute(1, 0, 2), acoustic_embed.permute(1, 0, 2),
            key_padding_mask=acoustic_pad_mask,
        )
        acoustic_embed = acoustic_embed.permute(1, 0, 2)

        fuse_embed = torch.cat([semantic_embed, acoustic_embed], dim=2)
        # Encode the fused multimodal sequence with the fusion LSTM.
        fuse_pack = nn.utils.rnn.pack_padded_sequence(
            fuse_embed, semantic_length.cpu(), batch_first=True, enforce_sorted=False
        )
        fuse_embed, _ = self.fuse_lstm(fuse_pack)
        fuse_embed, _ = nn.utils.rnn.pad_packed_sequence(
            fuse_embed, batch_first=True, total_length=semantic_input.size(1)
        )
        # Build the valid-step mask for pooling/CTC.
        fuse_mask = torch.arange(
            semantic_input.size(1))[None, :].repeat(semantic_input.size(0), 1
        ).to(semantic_input.device)
        fuse_mask = (fuse_mask < semantic_length[:, None].repeat(1, semantic_input.size(1))).float()

        if self.loss_name == 'BCE':
            # Max-pool over valid time steps to obtain the utterance representation.
            fuse_embed = fuse_embed - (1 - fuse_mask[:, :, None]) * 1e6
            fuse_embed = torch.max(fuse_embed, dim=1)[0]
            logits = self.classifier(fuse_embed)

        elif self.loss_name == 'CTC':
            logits = self.classifier(fuse_embed)  # [B,T,Dim]
            logits = F.log_softmax(logits, dim=-1)
            logits = logits * fuse_mask[:, :, None]

        else:
            raise ValueError('Loss type not supported!')

        return logits


class NeoMeanMaxExcite_v2(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config['semantic']['embedding_path'] is not None:
            # Prepend a zero row so index 0 can serve as the padding token.
            semantic_embed = np.load(config['semantic']['embedding_path'])
            semantic_embed = np.concatenate([np.zeros([1, semantic_embed.shape[1]]), semantic_embed], axis=0)
            self.semantic_embed = nn.Embedding.from_pretrained(torch.FloatTensor(semantic_embed), freeze=False)
        else:
            self.semantic_embed = nn.Embedding(config['semantic']['embedding_size']+1, config['semantic']['embedding_dim'])

        self.semantic_linear = nn.Linear(config['semantic']['embedding_dim'], config['semantic']['hidden_dim'])
        # CNN encoder for the audio features; the last layer emits hidden_dim/2
        # channels because mean- and max-pooled features are concatenated later.
        self.acoustic_cnn1 = nn.Conv1d(config['acoustic']['embedding_dim'], 64, 5, 1)
        self.acoustic_cnn2 = nn.Conv1d(64, 128, 2, 1)
        self.acoustic_cnn3 = nn.Conv1d(128, int(config['acoustic']['hidden_dim']/2), 2, 1)
        self.acoustic_mean1 = nn.AvgPool2d((1, 5), (1, 1))
        self.acoustic_mean2 = nn.AvgPool2d((1, 2), (1, 1))
        self.acoustic_mean3 = nn.AvgPool2d((1, 2), (1, 1))

        # Fusion LSTM over the concatenated semantic and acoustic features.
        self.fuse_lstm = nn.LSTM(
            config['semantic']['hidden_dim']+config['acoustic']['hidden_dim'],
            config['fusion']['hidden_dim'], 1, bidirectional=True,
            batch_first=True, dropout=0.5
        )

        # Optional cross-modal excitement layers.
        if config['acoustic']['excite']:
            self.acoustic_excit = nn.Embedding(config['semantic']['embedding_size']+1, config['acoustic']['hidden_dim'])
        else:
            self.acoustic_excit = None
        if config['semantic']['excite']:
            self.semantic_excit = nn.Linear(config['acoustic']['hidden_dim'], config['semantic']['hidden_dim'])
        else:
            self.semantic_excit = None

        self.loss_name = config['loss']['name']
        # CTC needs one extra output unit for the blank label.
        self.classifier = nn.Linear(
            2*config['fusion']['hidden_dim'],
            config['classifier']['class_num']+1*int(self.loss_name == 'CTC')
        )

    def forward(
        self,
        acoustic_input,
        acoustic_length,
        semantic_input,
        semantic_length,
        align_input,
    ):
        # First encode the token-level semantic patterns.
        semantic_embed = self.semantic_embed(semantic_input)  # [B,T,C]
        semantic_embed = self.semantic_linear(semantic_embed)
        # Then encode the frame-level acoustic patterns.
        acoustic_embed = self.acoustic_cnn1(acoustic_input.permute(0, 2, 1))
        acoustic_align = self.acoustic_mean1(align_input[:, None, :, :])

        acoustic_embed = self.acoustic_cnn2(acoustic_embed)
        acoustic_align = self.acoustic_mean2(acoustic_align)

        acoustic_embed = self.acoustic_cnn3(acoustic_embed)  # [B,C,A]
        acoustic_align = self.acoustic_mean3(acoustic_align)  # [B,1,T,A]

        acoustic_embed = acoustic_embed.permute(0, 2, 1)[:, None, :, :].repeat(1, semantic_embed.size(1), 1, 1)  # [B,T,A,C]
        acoustic_align = (acoustic_align.squeeze(1)[:, :, :, None] > 0).float()  # [B,T,A,1]
        # Masked mean over the frames aligned to each token.
        acoustic_mean = torch.sum(acoustic_embed*acoustic_align, dim=2) / (torch.sum(acoustic_align, dim=2) + 1e-6)  # [B,T,C]
        # Masked max over the frames aligned to each token.
        acoustic_max, _ = torch.max(acoustic_embed*acoustic_align - 1e6*(1.0-acoustic_align), dim=2)
        # Concatenate the mean- and max-pooled embeddings.
        acoustic_embed = torch.cat([acoustic_mean, acoustic_max], dim=-1)

        # Cross-modal excitement with a residual connection: unlike the base
        # model, the gated features are added back to the originals.
        if self.semantic_excit is not None:
            semantic_excit = torch.sigmoid(self.semantic_excit(acoustic_embed))
            semantic_embed = semantic_embed * semantic_excit + semantic_embed
        if self.acoustic_excit is not None:
            acoustic_excit = torch.sigmoid(self.acoustic_excit(semantic_input))
            acoustic_embed = acoustic_embed * acoustic_excit + acoustic_embed

        fuse_embed = torch.cat([semantic_embed, acoustic_embed], dim=2)
        # Encode the fused multimodal sequence with the fusion LSTM.
        fuse_pack = nn.utils.rnn.pack_padded_sequence(
            fuse_embed, semantic_length.cpu(), batch_first=True, enforce_sorted=False
        )
        fuse_embed, _ = self.fuse_lstm(fuse_pack)
        fuse_embed, _ = nn.utils.rnn.pad_packed_sequence(
            fuse_embed, batch_first=True, total_length=semantic_input.size(1)
        )
        # Build the valid-step mask for pooling/CTC.
        fuse_mask = torch.arange(
            semantic_input.size(1))[None, :].repeat(semantic_input.size(0), 1
        ).to(semantic_input.device)
        fuse_mask = (fuse_mask < semantic_length[:, None].repeat(1, semantic_input.size(1))).float()

        if self.loss_name == 'BCE':
            # Max-pool over valid time steps to obtain the utterance representation.
            fuse_embed = fuse_embed - (1 - fuse_mask[:, :, None]) * 1e6
            fuse_embed = torch.max(fuse_embed, dim=1)[0]
            logits = self.classifier(fuse_embed)

        elif self.loss_name == 'CTC':
            logits = self.classifier(fuse_embed)  # [B,T,Dim]
            logits = F.log_softmax(logits, dim=-1)
            logits = logits * fuse_mask[:, :, None]

        else:
            raise ValueError('Loss type not supported!')

        return logits


class NeoMeanMaxExcite_v3(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config['semantic']['embedding_path'] is not None:
            # Prepend a zero row so index 0 can serve as the padding token.
            semantic_embed = np.load(config['semantic']['embedding_path'])
            semantic_embed = np.concatenate([np.zeros([1, semantic_embed.shape[1]]), semantic_embed], axis=0)
            self.semantic_embed = nn.Embedding.from_pretrained(torch.FloatTensor(semantic_embed), freeze=False)
        else:
            self.semantic_embed = nn.Embedding(config['semantic']['embedding_size']+1, config['semantic']['embedding_dim'])

        self.semantic_linear = nn.Linear(config['semantic']['embedding_dim'], config['semantic']['hidden_dim'])
        # CNN encoder for the audio features; the last layer emits hidden_dim/2
        # channels because mean- and max-pooled features are concatenated later.
        self.acoustic_cnn1 = nn.Conv1d(config['acoustic']['embedding_dim'], 64, 5, 1)
        self.acoustic_cnn2 = nn.Conv1d(64, 128, 2, 1)
        self.acoustic_cnn3 = nn.Conv1d(128, int(config['acoustic']['hidden_dim']/2), 2, 1)
        self.acoustic_mean1 = nn.AvgPool2d((1, 5), (1, 1))
        self.acoustic_mean2 = nn.AvgPool2d((1, 2), (1, 1))
        self.acoustic_mean3 = nn.AvgPool2d((1, 2), (1, 1))

        # Fusion LSTM over the concatenated semantic and acoustic features.
        self.fuse_lstm = nn.LSTM(
            config['semantic']['hidden_dim']+config['acoustic']['hidden_dim'],
            config['fusion']['hidden_dim'], 1, bidirectional=True,
            batch_first=True, dropout=0.5
        )

        # Optional cross-modal excitement layers.
        if config['acoustic']['excite']:
            self.acoustic_excit = nn.Embedding(config['semantic']['embedding_size']+1, config['acoustic']['hidden_dim'])
        else:
            self.acoustic_excit = None
        if config['semantic']['excite']:
            self.semantic_excit = nn.Linear(config['acoustic']['hidden_dim'], config['semantic']['hidden_dim'])
        else:
            self.semantic_excit = None

        self.loss_name = config['loss']['name']
        # CTC needs one extra output unit for the blank label.
        self.classifier = nn.Linear(
            2*config['fusion']['hidden_dim'],
            config['classifier']['class_num']+1*int(self.loss_name == 'CTC')
        )

    def forward(
        self,
        acoustic_input,
        acoustic_length,
        semantic_input,
        semantic_length,
        align_input,
    ):
        # First encode the token-level semantic patterns.
        semantic_embed = self.semantic_embed(semantic_input)  # [B,T,C]
        semantic_embed = self.semantic_linear(semantic_embed)
        # Then encode the frame-level acoustic patterns.
        acoustic_embed = self.acoustic_cnn1(acoustic_input.permute(0, 2, 1))
        acoustic_align = self.acoustic_mean1(align_input[:, None, :, :])

        acoustic_embed = self.acoustic_cnn2(acoustic_embed)
        acoustic_align = self.acoustic_mean2(acoustic_align)

        acoustic_embed = self.acoustic_cnn3(acoustic_embed)  # [B,C,A]
        acoustic_align = self.acoustic_mean3(acoustic_align)  # [B,1,T,A]

        acoustic_embed = acoustic_embed.permute(0, 2, 1)[:, None, :, :].repeat(1, semantic_embed.size(1), 1, 1)  # [B,T,A,C]
        acoustic_align = (acoustic_align.squeeze(1)[:, :, :, None] > 0).float()  # [B,T,A,1]
        # Masked mean over the frames aligned to each token.
        acoustic_mean = torch.sum(acoustic_embed*acoustic_align, dim=2) / (torch.sum(acoustic_align, dim=2) + 1e-6)  # [B,T,C]
        # Masked max over the frames aligned to each token.
        acoustic_max, _ = torch.max(acoustic_embed*acoustic_align - 1e6*(1.0-acoustic_align), dim=2)
        # Concatenate the mean- and max-pooled embeddings.
        acoustic_embed = torch.cat([acoustic_mean, acoustic_max], dim=-1)

        # Cross-modal excitement, additive variant: the cross-modal features
        # are added directly instead of acting as sigmoid gates.
        if self.semantic_excit is not None:
            semantic_excit = self.semantic_excit(acoustic_embed)
            semantic_embed = semantic_embed + semantic_excit
        if self.acoustic_excit is not None:
            acoustic_excit = self.acoustic_excit(semantic_input)
            acoustic_embed = acoustic_embed + acoustic_excit

        fuse_embed = torch.cat([semantic_embed, acoustic_embed], dim=2)
        # Encode the fused multimodal sequence with the fusion LSTM.
        fuse_pack = nn.utils.rnn.pack_padded_sequence(
            fuse_embed, semantic_length.cpu(), batch_first=True, enforce_sorted=False
        )
        fuse_embed, _ = self.fuse_lstm(fuse_pack)
        fuse_embed, _ = nn.utils.rnn.pad_packed_sequence(
            fuse_embed, batch_first=True, total_length=semantic_input.size(1)
        )
        # Build the valid-step mask for pooling/CTC.
        fuse_mask = torch.arange(
            semantic_input.size(1))[None, :].repeat(semantic_input.size(0), 1
        ).to(semantic_input.device)
        fuse_mask = (fuse_mask < semantic_length[:, None].repeat(1, semantic_input.size(1))).float()

        if self.loss_name == 'BCE':
            # Max-pool over valid time steps to obtain the utterance representation.
            fuse_embed = fuse_embed - (1 - fuse_mask[:, :, None]) * 1e6
            fuse_embed = torch.max(fuse_embed, dim=1)[0]
            logits = self.classifier(fuse_embed)

        elif self.loss_name == 'CTC':
            logits = self.classifier(fuse_embed)  # [B,T,Dim]
            logits = F.log_softmax(logits, dim=-1)
            logits = logits * fuse_mask[:, :, None]

        else:
            raise ValueError('Loss type not supported!')

        return logits


class NeoMeanMaxExcite_v4(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config['semantic']['embedding_path'] is not None:
            # Prepend a zero row so index 0 can serve as the padding token.
            semantic_embed = np.load(config['semantic']['embedding_path'])
            semantic_embed = np.concatenate([np.zeros([1, semantic_embed.shape[1]]), semantic_embed], axis=0)
            self.semantic_embed = nn.Embedding.from_pretrained(torch.FloatTensor(semantic_embed), freeze=False)
        else:
            self.semantic_embed = nn.Embedding(config['semantic']['embedding_size']+1, config['semantic']['embedding_dim'])

        self.semantic_linear = nn.Linear(config['semantic']['embedding_dim'], config['semantic']['hidden_dim'])
        # CNN encoder for the audio features; the last layer emits hidden_dim/2
        # channels because mean- and max-pooled features are concatenated later.
        self.acoustic_cnn1 = nn.Conv1d(config['acoustic']['embedding_dim'], 64, 5, 1)
        self.acoustic_cnn2 = nn.Conv1d(64, 128, 2, 1)
        self.acoustic_cnn3 = nn.Conv1d(128, int(config['acoustic']['hidden_dim']/2), 2, 1)
        self.acoustic_mean1 = nn.AvgPool2d((1, 5), (1, 1))
        self.acoustic_mean2 = nn.AvgPool2d((1, 2), (1, 1))
        self.acoustic_mean3 = nn.AvgPool2d((1, 2), (1, 1))

        # Fusion LSTM over the concatenated semantic and acoustic features.
        self.fuse_lstm = nn.LSTM(
            config['semantic']['hidden_dim']+config['acoustic']['hidden_dim'],
            config['fusion']['hidden_dim'], 1, bidirectional=True,
            batch_first=True, dropout=0.5
        )

        # Optional cross-modal excitement layers.
        if config['acoustic']['excite']:
            self.acoustic_excit = nn.Embedding(config['semantic']['embedding_size']+1, config['acoustic']['hidden_dim'])
        else:
            self.acoustic_excit = None
        if config['semantic']['excite']:
            self.semantic_excit = nn.Linear(config['acoustic']['hidden_dim'], config['semantic']['hidden_dim'])
        else:
            self.semantic_excit = None

        self.loss_name = config['loss']['name']
        # CTC needs one extra output unit for the blank label.
        self.classifier = nn.Linear(
            2*config['fusion']['hidden_dim'],
            config['classifier']['class_num']+1*int(self.loss_name == 'CTC')
        )

    def forward(
        self,
        acoustic_input,
        acoustic_length,
        semantic_input,
        semantic_length,
        align_input,
    ):
        # First encode the token-level semantic patterns.
        semantic_embed = self.semantic_embed(semantic_input)  # [B,T,C]
        semantic_embed = self.semantic_linear(semantic_embed)
        # Then encode the frame-level acoustic patterns.
        acoustic_embed = self.acoustic_cnn1(acoustic_input.permute(0, 2, 1))
        acoustic_align = self.acoustic_mean1(align_input[:, None, :, :])

        acoustic_embed = self.acoustic_cnn2(acoustic_embed)
        acoustic_align = self.acoustic_mean2(acoustic_align)

        acoustic_embed = self.acoustic_cnn3(acoustic_embed)  # [B,C,A]
        acoustic_align = self.acoustic_mean3(acoustic_align)  # [B,1,T,A]

        acoustic_embed = acoustic_embed.permute(0, 2, 1)[:, None, :, :].repeat(1, semantic_embed.size(1), 1, 1)  # [B,T,A,C]
        acoustic_align = (acoustic_align.squeeze(1)[:, :, :, None] > 0).float()  # [B,T,A,1]
        # Masked mean over the frames aligned to each token.
        acoustic_mean = torch.sum(acoustic_embed*acoustic_align, dim=2) / (torch.sum(acoustic_align, dim=2) + 1e-6)  # [B,T,C]
        # Masked max over the frames aligned to each token.
        acoustic_max, _ = torch.max(acoustic_embed*acoustic_align - 1e6*(1.0-acoustic_align), dim=2)
        # Concatenate the mean- and max-pooled embeddings.
        acoustic_embed = torch.cat([acoustic_mean, acoustic_max], dim=-1)

        # Cross-modal excitement, mixed variant: additive on the semantic side,
        # sigmoid-gated on the acoustic side.
        if self.semantic_excit is not None:
            semantic_excit = self.semantic_excit(acoustic_embed)
            semantic_embed = semantic_embed + semantic_excit
        if self.acoustic_excit is not None:
            acoustic_excit = torch.sigmoid(self.acoustic_excit(semantic_input))
            acoustic_embed = acoustic_embed * acoustic_excit

        fuse_embed = torch.cat([semantic_embed, acoustic_embed], dim=2)
        # Encode the fused multimodal sequence with the fusion LSTM.
        fuse_pack = nn.utils.rnn.pack_padded_sequence(
            fuse_embed, semantic_length.cpu(), batch_first=True, enforce_sorted=False
        )
        fuse_embed, _ = self.fuse_lstm(fuse_pack)
        fuse_embed, _ = nn.utils.rnn.pad_packed_sequence(
            fuse_embed, batch_first=True, total_length=semantic_input.size(1)
        )
        # Build the valid-step mask for pooling/CTC.
        fuse_mask = torch.arange(
            semantic_input.size(1))[None, :].repeat(semantic_input.size(0), 1
        ).to(semantic_input.device)
        fuse_mask = (fuse_mask < semantic_length[:, None].repeat(1, semantic_input.size(1))).float()

        if self.loss_name == 'BCE':
            # Max-pool over valid time steps to obtain the utterance representation.
            fuse_embed = fuse_embed - (1 - fuse_mask[:, :, None]) * 1e6
            fuse_embed = torch.max(fuse_embed, dim=1)[0]
            logits = self.classifier(fuse_embed)

        elif self.loss_name == 'CTC':
            logits = self.classifier(fuse_embed)  # [B,T,Dim]
            logits = F.log_softmax(logits, dim=-1)
            logits = logits * fuse_mask[:, :, None]

        else:
            raise ValueError('Loss type not supported!')

        return logits


# Adapted from the Multimodal Transformer. To use this idea, the sequences
# likely need to be padded to a common length.
class CTCModule(nn.Module):
    def __init__(self, in_dim, out_seq_len):
        '''
        This module performs alignment from modality A (e.g., audio) to
        modality B (e.g., text).
        :param in_dim: feature dimension of input modality A
        :param out_seq_len: sequence length of output modality B
        '''
        super(CTCModule, self).__init__()
        # An LSTM predicts, for each input step of A, a distribution over the
        # output positions of B (out_seq_len positions plus one blank).
        self.pred_output_position_inclu_blank = nn.LSTM(in_dim, out_seq_len+1, num_layers=2, batch_first=True)

        self.out_seq_len = out_seq_len

        self.softmax = nn.Softmax(dim=2)

    def forward(self, x):
        '''
        :param x: input of shape [batch_size x in_seq_len x in_dim]
        '''
        # Note that index 0 refers to the blank position.
        pred_output_position_inclu_blank, _ = self.pred_output_position_inclu_blank(x)

        prob_pred_output_position_inclu_blank = self.softmax(pred_output_position_inclu_blank)  # batch_size x in_seq_len x out_seq_len+1
        prob_pred_output_position = prob_pred_output_position_inclu_blank[:, :, 1:]  # batch_size x in_seq_len x out_seq_len
        prob_pred_output_position = prob_pred_output_position.transpose(1, 2)  # batch_size x out_seq_len x in_seq_len
        pseudo_aligned_out = torch.bmm(prob_pred_output_position, x)  # batch_size x out_seq_len x in_dim

        # pseudo_aligned_out is regarded as A aligned to B.
        return pseudo_aligned_out, pred_output_position_inclu_blank
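

# A minimal usage sketch for CTCModule; dimensions are illustrative only.
def _ctc_module_demo():
    module = CTCModule(in_dim=16, out_seq_len=10)
    audio = torch.randn(2, 40, 16)               # [B, in_seq_len, in_dim]
    aligned, position_logits = module(audio)
    assert aligned.shape == (2, 10, 16)          # audio re-aligned to 10 text steps
    assert position_logits.shape == (2, 40, 11)  # per-frame position scores (+blank)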


class NeoMeanMaxExciteCTC(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config['semantic']['embedding_path'] is not None:
            # Prepend a zero row so index 0 can serve as the padding token.
            semantic_embed = np.load(config['semantic']['embedding_path'])
            semantic_embed = np.concatenate([np.zeros([1, semantic_embed.shape[1]]), semantic_embed], axis=0)
            self.semantic_embed = nn.Embedding.from_pretrained(torch.FloatTensor(semantic_embed), freeze=False)
        else:
            self.semantic_embed = nn.Embedding(config['semantic']['embedding_size']+1, config['semantic']['embedding_dim'])

        self.semantic_linear = nn.Linear(config['semantic']['embedding_dim'], config['semantic']['hidden_dim'])
        # CNN encoder for the audio features.
        self.acoustic_cnn1 = nn.Conv1d(config['acoustic']['embedding_dim'], 64, 5, 1)
        self.acoustic_cnn2 = nn.Conv1d(64, 128, 2, 1)
        self.acoustic_cnn3 = nn.Conv1d(128, config['acoustic']['hidden_dim'], 2, 1)
        # The average-pool layers are not used in forward(): here the alignment
        # is predicted by the CTC module rather than taken from align_input.
        self.acoustic_mean1 = nn.AvgPool2d((1, 5), (1, 1))
        self.acoustic_mean2 = nn.AvgPool2d((1, 2), (1, 1))
        self.acoustic_mean3 = nn.AvgPool2d((1, 2), (1, 1))

        self.align_ctc = CTCModule(config['acoustic']['hidden_dim'], config['semantic']['max_length'])
        self.logsoftmax = nn.LogSoftmax(dim=2)

        # Fusion LSTM over the concatenated semantic and acoustic features.
        self.fuse_lstm = nn.LSTM(
            config['semantic']['hidden_dim']+config['acoustic']['hidden_dim'],
            config['fusion']['hidden_dim'], 1, bidirectional=True,
            batch_first=True, dropout=0.5
        )

        # Optional cross-modal excitement layers.
        if config['acoustic']['excite']:
            self.acoustic_excit = nn.Embedding(config['semantic']['embedding_size']+1, config['acoustic']['hidden_dim'])
        else:
            self.acoustic_excit = None
        if config['semantic']['excite']:
            self.semantic_excit = nn.Linear(config['acoustic']['hidden_dim'], config['semantic']['hidden_dim'])
        else:
            self.semantic_excit = None

        self.loss_name = config['loss']['name']
        self.classifier = nn.Linear(
            2*config['fusion']['hidden_dim'],
            config['classifier']['class_num']
        )

    def forward(
        self,
        acoustic_input,
        acoustic_length,
        semantic_input,
        semantic_length,
        align_input,
    ):
        # First encode the token-level semantic patterns.
        semantic_embed = self.semantic_embed(semantic_input)  # [B,T,C]
        semantic_embed = self.semantic_linear(semantic_embed)
        # Then encode the frame-level acoustic patterns.
        acoustic_embed = self.acoustic_cnn1(acoustic_input.permute(0, 2, 1))
        acoustic_embed = self.acoustic_cnn2(acoustic_embed)
        acoustic_embed = self.acoustic_cnn3(acoustic_embed)  # [B,C,A]

        # Predict the frame-to-token alignment with the CTC module; this
        # assumes semantic_input is padded to config['semantic']['max_length'].
        acoustic_embed, align_logits = self.align_ctc(acoustic_embed.permute(0, 2, 1))
        align_logits = self.logsoftmax(align_logits)

        # Cross-modal excitement, additive variant.
        if self.semantic_excit is not None:
            semantic_excit = self.semantic_excit(acoustic_embed)
            semantic_embed = semantic_embed + semantic_excit
        if self.acoustic_excit is not None:
            acoustic_excit = self.acoustic_excit(semantic_input)
            acoustic_embed = acoustic_embed + acoustic_excit

        fuse_embed = torch.cat([semantic_embed, acoustic_embed], dim=2)
        # Encode the fused multimodal sequence with the fusion LSTM.
        fuse_pack = nn.utils.rnn.pack_padded_sequence(
            fuse_embed, semantic_length.cpu(), batch_first=True, enforce_sorted=False
        )
        fuse_embed, _ = self.fuse_lstm(fuse_pack)
        fuse_embed, _ = nn.utils.rnn.pad_packed_sequence(
            fuse_embed, batch_first=True
        )
        # Max-pool over valid time steps to obtain the utterance representation.
        fuse_mask = torch.arange(
            fuse_embed.size(1))[None, :].repeat(fuse_embed.size(0), 1
        ).to(fuse_embed.device)
        fuse_mask = (fuse_mask < semantic_length[:, None].repeat(1, fuse_embed.size(1))).float()

        fuse_embed = fuse_embed - (1 - fuse_mask[:, :, None]) * 1e6
        fuse_embed = torch.max(fuse_embed, dim=1)[0]
        logits = self.classifier(fuse_embed)

        return logits, align_logits


class NeoMeanMaxExciteVisual_v3(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config['semantic']['embedding_path'] is not None:
            # Prepend a zero row so index 0 can serve as the padding token.
            semantic_embed = np.load(config['semantic']['embedding_path'])
            semantic_embed = np.concatenate([np.zeros([1, semantic_embed.shape[1]]), semantic_embed], axis=0)
            self.semantic_embed = nn.Embedding.from_pretrained(torch.FloatTensor(semantic_embed), freeze=False)
        else:
            self.semantic_embed = nn.Embedding(config['semantic']['embedding_size']+1, config['semantic']['embedding_dim'])

        self.semantic_linear = nn.Linear(config['semantic']['embedding_dim'], config['semantic']['hidden_dim'])
        # CNN encoder for the audio features; the last layer emits hidden_dim/2
        # channels because mean- and max-pooled features are concatenated later.
        self.acoustic_cnn1 = nn.Conv1d(config['acoustic']['embedding_dim'], 64, 5, 1)
        self.acoustic_cnn2 = nn.Conv1d(64, 128, 2, 1)
        self.acoustic_cnn3 = nn.Conv1d(128, int(config['acoustic']['hidden_dim']/2), 2, 1)
        self.acoustic_mean1 = nn.AvgPool2d((1, 5), (1, 1))
        self.acoustic_mean2 = nn.AvgPool2d((1, 2), (1, 1))
        self.acoustic_mean3 = nn.AvgPool2d((1, 2), (1, 1))

        # Fusion LSTM over the concatenated semantic and acoustic features.
        self.fuse_lstm = nn.LSTM(
            config['semantic']['hidden_dim']+config['acoustic']['hidden_dim'],
            config['fusion']['hidden_dim'], 1, bidirectional=True,
            batch_first=True, dropout=0.5
        )

        # Optional cross-modal excitement layers.
        if config['acoustic']['excite']:
            self.acoustic_excit = nn.Embedding(config['semantic']['embedding_size']+1, config['acoustic']['hidden_dim'])
        else:
            self.acoustic_excit = None
        if config['semantic']['excite']:
            self.semantic_excit = nn.Linear(config['acoustic']['hidden_dim'], config['semantic']['hidden_dim'])
        else:
            self.semantic_excit = None

        self.loss_name = config['loss']['name']
        # CTC needs one extra output unit for the blank label.
        self.classifier = nn.Linear(
            2*config['fusion']['hidden_dim'],
            config['classifier']['class_num']+1*int(self.loss_name == 'CTC')
        )

    def forward(
        self,
        acoustic_input,
        acoustic_length,
        semantic_input,
        semantic_length,
        align_input,
    ):
        # First encode the token-level semantic patterns.
        semantic_embed = self.semantic_embed(semantic_input)  # [B,T,C]
        semantic_embed = self.semantic_linear(semantic_embed)
        # Then encode the frame-level acoustic patterns.
        acoustic_embed = self.acoustic_cnn1(acoustic_input.permute(0, 2, 1))
        acoustic_align = self.acoustic_mean1(align_input[:, None, :, :])

        acoustic_embed = self.acoustic_cnn2(acoustic_embed)
        acoustic_align = self.acoustic_mean2(acoustic_align)

        acoustic_embed = self.acoustic_cnn3(acoustic_embed)  # [B,C,A]
        acoustic_align = self.acoustic_mean3(acoustic_align)  # [B,1,T,A]

        acoustic_embed = acoustic_embed.permute(0, 2, 1)[:, None, :, :].repeat(1, semantic_embed.size(1), 1, 1)  # [B,T,A,C]
        acoustic_align = (acoustic_align.squeeze(1)[:, :, :, None] > 0).float()  # [B,T,A,1]
        # Masked mean over the frames aligned to each token.
        acoustic_mean = torch.sum(acoustic_embed*acoustic_align, dim=2) / (torch.sum(acoustic_align, dim=2) + 1e-6)  # [B,T,C]
        # Masked max over the frames aligned to each token.
        acoustic_max, _ = torch.max(acoustic_embed*acoustic_align - 1e6*(1.0-acoustic_align), dim=2)
        # Concatenate the mean- and max-pooled embeddings.
        acoustic_embed = torch.cat([acoustic_mean, acoustic_max], dim=-1)

        # Cross-modal excitement, additive variant.
        if self.semantic_excit is not None:
            semantic_excit = self.semantic_excit(acoustic_embed)
            semantic_embed = semantic_embed + semantic_excit
        if self.acoustic_excit is not None:
            acoustic_excit = self.acoustic_excit(semantic_input)
            acoustic_embed = acoustic_embed + acoustic_excit

        fuse_embed = torch.cat([semantic_embed, acoustic_embed], dim=2)
        # Encode the fused multimodal sequence with the fusion LSTM.
        fuse_pack = nn.utils.rnn.pack_padded_sequence(
            fuse_embed, semantic_length.cpu(), batch_first=True, enforce_sorted=False
        )
        fuse_embed, _ = self.fuse_lstm(fuse_pack)
        fuse_embed, _ = nn.utils.rnn.pad_packed_sequence(
            fuse_embed, batch_first=True, total_length=semantic_input.size(1)
        )
        # Build the valid-step mask for pooling/CTC.
        fuse_mask = torch.arange(
            semantic_input.size(1))[None, :].repeat(semantic_input.size(0), 1
        ).to(semantic_input.device)
        fuse_mask = (fuse_mask < semantic_length[:, None].repeat(1, semantic_input.size(1))).float()

        if self.loss_name == 'BCE':
            # Max-pool over valid time steps to obtain the utterance representation.
            fuse_embed = fuse_embed - (1 - fuse_mask[:, :, None]) * 1e6
            fuse_embed = torch.max(fuse_embed, dim=1)[0]
            logits = self.classifier(fuse_embed)

        elif self.loss_name == 'CTC':
            logits = self.classifier(fuse_embed)  # [B,T,Dim]
            logits = F.log_softmax(logits, dim=-1)
            logits = logits * fuse_mask[:, :, None]

        else:
            raise ValueError('Loss type not supported!')

        # Also return the fused representation (pooled to [B,D] under BCE),
        # e.g., for later fusion with visual features.
        return logits, fuse_embed
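

# End-to-end smoke test for NeoMeanMaxExcite; the config below mirrors the
# keys the model reads, with small illustrative values (assumed, not canonical).
if __name__ == '__main__':
    _config = {
        'acoustic': {'embedding_dim': 34, 'hidden_dim': 8, 'excite': True},
        'semantic': {'embedding_path': None, 'embedding_size': 100,
                     'embedding_dim': 20, 'hidden_dim': 12, 'excite': True},
        'fusion': {'hidden_dim': 16},
        'classifier': {'class_num': 4},
        'loss': {'name': 'BCE'},
    }
    _model = NeoMeanMaxExcite(_config)
    _acoustic = torch.randn(2, 30, 34)         # [B,A,feat]
    _semantic = torch.randint(1, 101, (2, 6))  # [B,T] token ids (0 = pad)
    _align = torch.ones(2, 6, 30)              # [B,T,A] frame-to-token alignment
    _logits = _model(_acoustic, torch.tensor([30, 25]),
                     _semantic, torch.tensor([6, 4]), _align)
    assert _logits.shape == (2, 4)
    print('NeoMeanMaxExcite OK:', tuple(_logits.shape))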