|
# -*- coding: utf-8 -*-
'''
# @time: 2023/2/19 15:44
# Author: DFTL
# @File: Module.py
'''
-
- import torch
- import torch.nn as nn
- from torch.nn import CrossEntropyLoss,MSELoss
- from transformers import BertTokenizer,BertModel
- from transformers.models.bert.configuration_bert import BertConfig
- import numpy as np
- import torch.nn.functional as F
- from Models.Cross_Attention import Cross_Attention_v2,Cross_Attention
- device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-
-
-
-
'''The word-vector dimension equals the feature map's channel count; token_len is the number of tokens.'''
class Semantic_Token(nn.Module):
    """Pool a (B, C, H, W) feature map into `token_len` C-dim semantic tokens.

    A 1x1 convolution produces one spatial-attention map per token; each token
    is the attention-weighted sum of the feature map over the H*W grid.
    """

    def __init__(self, inchanels, token_len):
        super(Semantic_Token, self).__init__()
        self.inchanels = inchanels
        self.token_len = token_len
        # 1x1 conv: one attention logit map per output token.
        self.conv_a = nn.Conv2d(self.inchanels, self.token_len, kernel_size=1, padding=0, bias=False)

    def forward(self, x):
        batch, channels = x.shape[0], x.shape[1]
        # Attention logits over spatial positions, softmax-normalised per token.
        attn = self.conv_a(x).view([batch, self.token_len, -1]).contiguous()
        attn = torch.softmax(attn, dim=-1)
        flat = x.view([batch, channels, -1]).contiguous()
        # Weighted sum over the n = H*W positions -> (B, L, C).
        tokens = torch.einsum('bln,bcn->blc', attn, flat)
        # Squeeze the token axis so token_len == 1 yields a (B, C) vector.
        return torch.squeeze(tokens, 1)
-
class Semantic_Token_v2(nn.Module):
    """ViT-style image tokenizer/classifier: patchify -> CLS + positional
    embeddings -> 6-layer Transformer encoder -> (token, class logits).

    Fix vs. original: the CLS token and the positional-embedding table were
    re-drawn from `torch.randn` on every forward pass, so they were neither
    learned nor deterministic; they are now registered `nn.Parameter`s (which
    also removes the dependency on the module-level `device` global).
    """

    def __init__(self, in_channel, model_dim, kenel=9, embed_dim=64, num_class=12):
        # in_channel: image channels; model_dim: transformer width (must be
        # divisible by nhead=8); kenel: patch size == stride (non-overlapping).
        super(Semantic_Token_v2, self).__init__()
        self.model_dim = model_dim
        self.num_class = num_class

        # Patch embedding: stride == kernel size gives non-overlapping patches.
        self.image2emb = nn.Conv2d(in_channel, model_dim, kenel, stride=kenel)

        self.encoder_layer = nn.TransformerEncoderLayer(d_model=model_dim, nhead=8)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=6)

        # Learned CLS token and positional table (max 64 positions, as before).
        self.cls_token = nn.Parameter(torch.randn(1, 1, model_dim))
        self.position_embedding_table = nn.Parameter(torch.randn(64, model_dim))

        self.linear_layer = nn.Linear(model_dim, embed_dim)
        self.class_layer = nn.Linear(embed_dim, num_class)

    def forward(self, x):
        # step 1: convert image to patch embeddings: (B, D, oh, ow) -> (B, oh*ow, D)
        patches = self.image2emb(x)
        bs, oc, oh, ow = patches.shape
        patch_embedding = patches.reshape(bs, oc, oh * ow).transpose(-1, -2)

        # step 2: prepend the (shared, learned) CLS token to every sequence.
        cls = self.cls_token.expand(bs, -1, -1)
        token_embedding = torch.cat([cls, patch_embedding], dim=1)

        # step 3: add positional embeddings, truncated to the sequence length.
        seq_len = token_embedding.shape[1]
        token_embedding = token_embedding + self.position_embedding_table[:seq_len]

        # step 4: transformer encoder.
        # NOTE(review): nn.TransformerEncoder defaults to (seq, batch, dim) input
        # but this code passes (batch, seq, dim) without batch_first=True;
        # preserved as-is to keep the original behaviour — confirm intended.
        encoder_output = self.transformer_encoder(token_embedding)

        # step 5: read out the CLS position for token + classification outputs.
        cls_token_output = encoder_output[:, 0, :]
        token = self.linear_layer(cls_token_output)
        output = self.class_layer(token)

        return token, output
-
'''Event embedding'''
class Event_embeding(nn.Module):
    """Encode a single raw text string into a 64-d event vector with BERT.

    Pipeline: tokenizer -> BERT pooler output (768-d) -> Linear(768, 64) -> ReLU.
    """

    def __init__(self, max_len, pretrained_bert):
        # max_len: tokenizer truncation length.
        # pretrained_bert: name/path for HuggingFace from_pretrained.
        super(Event_embeding, self).__init__()
        self.max_len = max_len
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_bert)
        self.bert = BertModel.from_pretrained(pretrained_bert)
        self.linear = nn.Linear(768, 64)
        self.relu = nn.ReLU()

    def forward(self, text):
        # Tokenize one text; wrapping in a list adds the batch dim (batch size 1).
        tokens = self.tokenizer(text, max_length=self.max_len, truncation=True)['input_ids']
        # NOTE(review): this tensor is created on CPU; if the module is moved to
        # GPU the BERT call will fail with a device mismatch — confirm callers
        # keep this model on CPU or move token_ids accordingly.
        token_ids = torch.tensor([tokens], dtype=torch.long)
        # Index [1] selects BERT's pooler output (CLS-based sentence embedding).
        encoded_text = self.bert(token_ids)[1]
        output = self.linear(encoded_text)
        output = self.relu(output)

        return output
-
class Event_embeding_v2(nn.Module):
    """Project a precomputed 768-d BERT vector down to 64-d (768 -> 128 -> 64)."""

    def __init__(self):
        super(Event_embeding_v2, self).__init__()
        self.linear1 = nn.Linear(768, 128)
        self.linear2 = nn.Linear(128, 64)
        self.relu = nn.ReLU()

    def forward(self, input):
        # No activation between the two linear layers; ReLU only at the end.
        return self.relu(self.linear2(self.linear1(input)))
-
class Event_embeding_v3(nn.Module):
    """Encode pre-tokenized event text with a configurable BERT and project the
    [CLS] hidden state down to `embed_dim`."""

    def __init__(self, bert_path, config_path, embed_dim):
        # bert_path: pretrained weight path; config_path: BERT json config file;
        # embed_dim: output projection size.
        super(Event_embeding_v3, self).__init__()

        bert_config = BertConfig.from_json_file(config_path)
        self.event_encoder = BertModel.from_pretrained(bert_path, ignore_mismatched_sizes=True, config=bert_config)
        self.event_proj = nn.Linear(self.event_encoder.config.hidden_size, embed_dim)

    def forward(self, event):
        # event: a tokenizer batch object exposing .input_ids / .attention_mask.
        event_output = self.event_encoder(event.input_ids, attention_mask=event.attention_mask,
                                          return_dict=True)
        text_embeds = event_output.last_hidden_state
        # Take the [CLS] (position 0) hidden state and project it to embed_dim.
        event_output = self.event_proj(text_embeds[:, 0, :])

        return event_output
-
'''Deep-feature extraction'''
class DeepFeature(nn.Module):
    """3-D conv stack over an image cube, collapsed to 2-D feature maps.

    For the (B, 32, 27, 27) input used in this project:
    unsqueeze -> (B, 1, 32, 27, 27)
    conv1 (7,3,3) -> (B, 8, 26, 25, 25); conv2 (5,3,3) -> (B, 16, 22, 23, 23)
    reshape      -> (B, 352, 23, 23);    conv3_2d 3x3 -> (B, 64, 21, 21)
    """

    def __init__(self):
        super(DeepFeature, self).__init__()
        self.conv1 = nn.Conv3d(1, 8, (7, 3, 3))
        self.conv2 = nn.Conv3d(8, 16, (5, 3, 3))
        # 2-D conv over the flattened (channel * depth) axis: 16 * 22 = 352 inputs.
        self.conv3_2d = nn.Conv2d(352, 64, (3, 3))

        self.bn1 = nn.BatchNorm3d(8)
        self.bn2 = nn.BatchNorm3d(16)
        # bn3 is defined but unused by forward(); kept so existing checkpoints
        # still load with the same state-dict keys.
        self.bn3 = nn.BatchNorm2d(64)

        self.relu = nn.ReLU()

    def forward(self, input):
        # Add the singleton channel axis required by Conv3d.
        out = torch.unsqueeze(input, 1)
        # ReLU is applied *before* BatchNorm, as in the original design.
        out = self.bn1(self.relu(self.conv1(out)))
        out = self.bn2(self.relu(self.conv2(out)))
        # Fold the 3-D channel and depth axes together for the 2-D conv.
        out = out.view(-1, out.shape[1] * out.shape[2], out.shape[3], out.shape[4])
        return self.relu(self.conv3_2d(out))
-
'''Classification head'''
class Classifier_head(nn.Module):
    """MLP head mapping (B, 64, 21, 21) feature maps to 12 class logits."""

    def __init__(self):
        super(Classifier_head, self).__init__()
        # conv1/conv2 are not used by forward(); kept so existing checkpoints
        # still load with the same state-dict keys.
        self.conv1 = nn.Conv2d(64, 32, 3)
        self.conv2 = nn.Conv2d(32, 1, 3)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(64 * 21 * 21, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 12)
        self.drop = nn.Dropout(p=0.4)

    def forward(self, input):
        # Flatten, then fc -> dropout -> fc -> dropout -> fc.
        # There are no activations between layers, as in the original design.
        hidden = self.drop(self.fc1(self.flatten(input)))
        hidden = self.drop(self.fc2(hidden))
        return self.fc3(hidden)
-
class Classifier_head_event(nn.Module):
    """Single linear layer mapping a 64-d event embedding to 12 class logits."""

    def __init__(self):
        super(Classifier_head_event, self).__init__()
        self.classifier = nn.Linear(64, 12)

    def forward(self, input):
        logits = self.classifier(input)
        return logits
-
'''MyModule'''
class MyModule(nn.Module):
    """Early prototype: deep image features -> one semantic token, plus a BERT
    embedding for each event text in `input2`.

    Fix vs. original: `self.linear = nn.Linear()` raised TypeError (nn.Linear
    requires in/out features), making the class unconstructible. The layer was
    never used, so it has been removed.
    """

    def __init__(self, args):
        # args must provide .max_len and .pretrained_bert for the event encoder.
        super(MyModule, self).__init__()

        self.args = args
        self.deepfeature = DeepFeature()
        self.semantic_token = Semantic_Token(64, 1)
        self.event_embeding = Event_embeding(self.args.max_len, self.args.pretrained_bert)

    def forward(self, input1, input2):
        # input1: image cube batch; input2: sequence of raw event texts (may be empty).
        output = self.semantic_token(self.deepfeature(input1))

        # One 64-d embedding per event text; empty input2 yields an empty list.
        output_list = [self.event_embeding(text) for text in input2]

        return output, output_list
-
class MyModule_v1(nn.Module):
    """CLIP-style alignment between one image token and one event embedding.

    forward() returns only the symmetric contrastive alignment loss `loss1`;
    the classification logits `output2` are computed but unused here.
    """

    def __init__(self):
        super(MyModule_v1, self).__init__()

        # Learnable logit scale, initialised to log(1/0.07) as in CLIP.
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

        self.deepfeature = DeepFeature()

        self.classifier = Classifier_head()

        self.semantic_token = Semantic_Token(64, 1)

        self.event_embeding = Event_embeding_v2()

        # Linear map projecting both modalities into a shared space.
        self.linear = nn.Linear(64, 64)

    def forward(self, loss_, label, input1, encoded_event1=None):
        # loss_: CrossEntropyLoss-like callable; label: (B,) class ids;
        # input1: image cube batch; encoded_event1: (B, 768) BERT vectors.
        batch_size = label.shape[0]

        image_features = self.deepfeature(input1)
        output2 = self.classifier(image_features)  # computed but not returned

        image_token = self.semantic_token(image_features)

        encoded_event1 = self.event_embeding(encoded_event1)

        # L2-normalise both modalities.
        # NOTE(review): normalisation happens *before* the shared-space linear
        # projection, so the projected vectors are not unit-norm — confirm intended.
        image_token = image_token / image_token.norm(dim=1, keepdim=True)
        encoded_event = encoded_event1 / encoded_event1.norm(dim=1, keepdim=True)

        # Map both modalities into the shared space (same linear layer).
        image_token = self.linear(image_token)
        encoded_event = self.linear(encoded_event)

        ## loss1: scaled similarity logits in both directions.
        logit_scale = self.logit_scale.exp()
        logits_per_image = logit_scale * image_token @ encoded_event.t()
        logits_per_event = logits_per_image.t()

        # Alignment target: the i-th image matches the i-th event (batch indices).
        label_align = torch.arange(batch_size).to(device)
        loss_i = loss_(logits_per_image, label_align.long())
        loss_e = loss_(logits_per_event, label_align.long())

        loss1 = (loss_i + loss_e) / 2

        return loss1
-
class MyModuel_v2(nn.Module):
    """Image/event alignment against three event descriptions per sample.

    Returns (loss, loss1): the image classification loss, and the mean of
    three symmetric contrastive losses (one per event description).
    """

    def __init__(self):
        super(MyModuel_v2, self).__init__()

        # Learnable logit scale, initialised to log(1/0.07) as in CLIP.
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

        self.deepfeature = DeepFeature()

        self.classifier = Classifier_head()

        self.semantic_token = Semantic_Token(64, 1)

        self.event_embeding = Event_embeding_v2()

    def forward(self, loss_, label, input1, encoded_event1, encoded_event2, encoded_event3):
        # loss_: CrossEntropyLoss-like callable; label: (B,) class ids;
        # input1: image cube batch; encoded_eventN: (B, 768) BERT vectors.
        batch_size = label.shape[0]

        image_features = self.deepfeature(input1)
        output2 = self.classifier(image_features)

        image_token = self.semantic_token(image_features)

        encoded_event1 = self.event_embeding(encoded_event1)
        encoded_event2 = self.event_embeding(encoded_event2)
        encoded_event3 = self.event_embeding(encoded_event3)

        # L2-normalise both modalities.
        image_token = image_token / image_token.norm(dim=1, keepdim=True)
        encoded_event = encoded_event1 / encoded_event1.norm(dim=1, keepdim=True)

        ## classification loss (uses the original class labels)
        loss = loss_(output2, label.long())

        ## loss1: contrastive loss against event description 1
        logit_scale = self.logit_scale.exp()
        logits_per_image = logit_scale * image_token @ encoded_event.t()
        logits_per_event = logits_per_image.t()

        # `label` is deliberately overwritten: alignment targets are the batch
        # indices (sample i matches event i), not the class labels.
        label = torch.arange(batch_size).to(device)
        loss_i = loss_(logits_per_image.to(device), label.long())
        loss_e = loss_(logits_per_event, label.long())

        loss1 = (loss_i + loss_e) / 2

        ## loss2: contrastive loss against event description 2
        encoded_event = encoded_event2 / encoded_event2.norm(dim=1, keepdim=True)

        logit_scale = self.logit_scale.exp()
        logits_per_image = logit_scale * image_token @ encoded_event.t()
        logits_per_event = logits_per_image.t()

        loss_i = loss_(logits_per_image, label.long())
        loss_e = loss_(logits_per_event, label.long())

        loss2 = (loss_i + loss_e) / 2

        ## loss3: contrastive loss against event description 3
        encoded_event = encoded_event3 / encoded_event3.norm(dim=1, keepdim=True)

        logit_scale = self.logit_scale.exp()
        logits_per_image = logit_scale * image_token @ encoded_event.t()
        logits_per_event = logits_per_image.t()

        loss_i = loss_(logits_per_image, label.long())
        loss_e = loss_(logits_per_event, label.long())

        loss3 = (loss_i + loss_e) / 2

        # Average the three alignment losses.
        loss1 = (loss1 + loss2 + loss3) / 3
        return loss, loss1
-
class MyModuel_v3(nn.Module):  # label-alignment-only experiment
    """Single-event variant of MyModuel_v2: classification loss plus one
    symmetric contrastive alignment loss. Returns (loss, loss1)."""

    def __init__(self):
        super(MyModuel_v3, self).__init__()

        # Learnable logit scale, initialised to log(1/0.07) as in CLIP.
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

        self.deepfeature = DeepFeature()

        self.classifier = Classifier_head()

        self.semantic_token = Semantic_Token(64, 1)

        self.event_embeding = Event_embeding_v2()

    def forward(self, loss_, label, input1, encoded_event1=None):
        # loss_: CrossEntropyLoss-like callable; label: (B,) class ids;
        # input1: image cube batch; encoded_event1: (B, 768) BERT vectors.
        batch_size = label.shape[0]

        image_features = self.deepfeature(input1)
        output2 = self.classifier(image_features)

        image_token = self.semantic_token(image_features)

        encoded_event1 = self.event_embeding(encoded_event1)

        # L2-normalise both modalities.
        image_token = image_token / image_token.norm(dim=1, keepdim=True)
        encoded_event = encoded_event1 / encoded_event1.norm(dim=1, keepdim=True)

        ## classification loss (uses the original class labels)
        loss = loss_(output2, label.long())

        ## loss1: contrastive loss, targets are batch indices (sample i <-> event i)
        logit_scale = self.logit_scale.exp()
        logits_per_image = logit_scale * image_token @ encoded_event.t()
        logits_per_event = logits_per_image.t()

        label_align = torch.arange(batch_size).to(device)
        loss_i = loss_(logits_per_image, label_align.long())
        loss_e = loss_(logits_per_event, label_align.long())

        loss1 = (loss_i + loss_e) / 2

        # The loss2/loss3 branches (event descriptions 2 and 3, as in
        # MyModuel_v2) were removed in this label-alignment-only variant.
        return loss, loss1
-
class MyModuel_v4(nn.Module):  # project both modalities into a shared space
    """Like MyModuel_v3 but with a shared linear projection applied to both
    modalities after normalisation. Returns (loss, loss1)."""

    def __init__(self):
        super(MyModuel_v4, self).__init__()

        # Learnable logit scale, initialised to log(1/0.07) as in CLIP.
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

        self.deepfeature = DeepFeature()

        self.classifier = Classifier_head()

        self.semantic_token = Semantic_Token(64, 1)

        self.event_embeding = Event_embeding_v2()

        # Shared linear map projecting both feature types into one space.
        self.linear = nn.Linear(64, 64)

    def forward(self, loss_, label, input1, encoded_event1=None):
        # loss_: CrossEntropyLoss-like callable; label: (B,) class ids;
        # input1: image cube batch; encoded_event1: (B, 768) BERT vectors.
        batch_size = label.shape[0]

        image_features = self.deepfeature(input1)
        output2 = self.classifier(image_features)

        image_token = self.semantic_token(image_features)

        encoded_event1 = self.event_embeding(encoded_event1)

        # L2-normalise both modalities.
        # NOTE(review): normalisation precedes the projection, so the projected
        # vectors are not unit-norm — confirm intended.
        image_token = image_token / image_token.norm(dim=1, keepdim=True)
        encoded_event = encoded_event1 / encoded_event1.norm(dim=1, keepdim=True)

        # Project into the shared space (same layer for both modalities).
        image_token = self.linear(image_token)
        encoded_event = self.linear(encoded_event)

        ## classification loss (uses the original class labels)
        loss = loss_(output2, label.long())

        ## loss1: contrastive loss, targets are batch indices (sample i <-> event i)
        logit_scale = self.logit_scale.exp()
        logits_per_image = logit_scale * image_token @ encoded_event.t()
        logits_per_event = logits_per_image.t()

        label_align = torch.arange(batch_size).to(device)
        loss_i = loss_(logits_per_image, label_align.long())
        loss_e = loss_(logits_per_event, label_align.long())

        loss1 = (loss_i + loss_e) / 2

        return loss, loss1
-
class MyModule_v5(nn.Module):
    """Like MyModuel_v3 but with the BERT-based Event_embeding_v3 encoder and
    F.cross_entropy instead of an injected loss callable.

    NOTE(review): forward() currently returns the raw (image_token,
    encoded_event) pair; `loss` and `loss1` are computed but discarded — this
    matches the original (embedding-export mode). Confirm which return is
    intended for training.
    """

    def __init__(self, args):
        # args must provide .pretrained_bert, .config_path and .embed_dim.
        super(MyModule_v5, self).__init__()
        # Learnable logit scale, initialised to log(1/0.07) as in CLIP.
        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

        self.deepfeature = DeepFeature()

        self.classifier = Classifier_head()

        self.semantic_token = Semantic_Token(64, 1)

        self.event_embeding = Event_embeding_v3(args.pretrained_bert, args.config_path, args.embed_dim)

    def forward(self, image, event, label):
        # image: image cube batch; event: tokenized text batch; label: (B,) ids.
        batch_size = image.shape[0]

        image_features = self.deepfeature(image)
        output2 = self.classifier(image_features)

        image_token = self.semantic_token(image_features)

        encoded_event1 = self.event_embeding(event)

        # L2-normalise both modalities.
        image_token = image_token / image_token.norm(dim=1, keepdim=True)
        encoded_event = encoded_event1 / encoded_event1.norm(dim=1, keepdim=True)

        ## classification loss (computed but not returned)
        loss = F.cross_entropy(output2, label.long())

        ## loss1: contrastive loss, targets are batch indices (computed, unused)
        logit_scale = self.logit_scale.exp()
        logits_per_image = logit_scale * image_token @ encoded_event.t()
        logits_per_event = logits_per_image.t()

        label_align = torch.arange(batch_size).to(device)
        loss_i = F.cross_entropy(logits_per_image, label_align.long())
        loss_e = F.cross_entropy(logits_per_event, label_align.long())

        loss1 = (loss_i + loss_e) / 2

        return image_token, encoded_event
-
class MyModule_v6(nn.Module):  # adds a second (event) classification head and reworks the loss projections
    """Dual-head contrastive model: image head + event head + soft-label
    image-text alignment (ALBEF-style clamped temperature).

    Returns (loss_ita, loss1, loss2): alignment loss, image-head CE, event-head CE.
    """

    def __init__(self, args):
        # args must provide .pretrained_bert, .config_path and .embed_dim.
        super(MyModule_v6, self).__init__()

        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

        self.deepfeature = DeepFeature()

        self.classifier = Classifier_head()
        self.classifier_e = Classifier_head_event()

        self.semantic_token = Semantic_Token(64, 1)

        self.event_embeding = Event_embeding_v3(args.pretrained_bert, args.config_path, args.embed_dim)

        # Per-modality projections into a common embedding space.
        self.image_proj = nn.Linear(64, args.embed_dim)
        self.event_proj = nn.Linear(args.embed_dim, args.embed_dim)

        # Learnable temperature for the similarity logits (clamped each step).
        self.temp = nn.Parameter(torch.ones([]) * 0.07)

    def forward(self, image, event, label):
        batch_size = image.shape[0]
        # Keep the temperature in a sane range without tracking gradients.
        with torch.no_grad():
            self.temp.clamp_(0.001, 0.5)

        image_features = self.deepfeature(image)
        output1 = self.classifier(image_features)
        image_token = self.semantic_token(image_features)
        image_token = F.normalize(self.image_proj(image_token), dim=1)

        encoded_event = self.event_embeding(event)
        encoded_event = F.normalize(self.event_proj(encoded_event), dim=1)

        output2 = self.classifier_e(encoded_event)

        # Temperature-scaled cosine similarities in both directions.
        sim_i2t = image_token @ encoded_event.t() / self.temp
        sim_t2i = sim_i2t.t()

        # Target: identity matrix (sample i aligns with event i).
        sim_targets = torch.zeros(sim_i2t.size()).to(device)
        sim_targets.fill_diagonal_(1)

        # Soft cross-entropy against the diagonal targets, both directions.
        loss_i2t = -torch.sum(F.log_softmax(sim_i2t, dim=1) * sim_targets, dim=1).mean()
        loss_t2i = -torch.sum(F.log_softmax(sim_t2i, dim=1) * sim_targets, dim=1).mean()

        loss_ita = (loss_i2t + loss_t2i) / 2

        loss1 = F.cross_entropy(output1, label.long())
        loss2 = F.cross_entropy(output2, label.long())

        return loss_ita, loss1, loss2
-
class MyModule_v6_1(nn.Module):  # MSE variant of v6
    """Like MyModule_v6 but the alignment loss is an MSE between the softmaxed
    image->text similarity matrix and a label co-occurrence target matrix
    (onehot @ onehot.T: 1 where two samples share a class).

    NOTE(review): forward() currently returns (image_token, encoded_event);
    loss_ita/loss1/loss2 are computed but discarded — matches the original
    (embedding-export mode). Confirm which return is intended for training.
    """

    def __init__(self, args):
        # args must provide .pretrained_bert, .config_path and .embed_dim.
        super(MyModule_v6_1, self).__init__()

        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

        self.deepfeature = DeepFeature()

        self.classifier = Classifier_head()
        self.classifier_e = Classifier_head_event()

        self.semantic_token = Semantic_Token(64, 1)

        self.event_embeding = Event_embeding_v3(args.pretrained_bert, args.config_path, args.embed_dim)

        # Per-modality projections into a common embedding space.
        self.image_proj = nn.Linear(64, args.embed_dim)
        self.event_proj = nn.Linear(args.embed_dim, args.embed_dim)

        # Learnable temperature for the similarity logits (clamped each step).
        self.temp = nn.Parameter(torch.ones([]) * 0.07)

        self.mse = MSELoss()

    def forward(self, image, event, label):
        batch_size = image.shape[0]
        # Keep the temperature in a sane range without tracking gradients.
        with torch.no_grad():
            self.temp.clamp_(0.001, 0.5)

        image_features = self.deepfeature(image)
        output1 = self.classifier(image_features)
        image_token = self.semantic_token(image_features)
        image_token = F.normalize(self.image_proj(image_token), dim=1)

        encoded_event = self.event_embeding(event)
        encoded_event = F.normalize(self.event_proj(encoded_event), dim=1)

        output2 = self.classifier_e(encoded_event)

        # Temperature-scaled cosine similarities in both directions.
        sim_i2t = image_token @ encoded_event.t() / self.temp
        sim_t2i = sim_i2t.t()

        # Target matrix from label co-occurrence (12 classes assumed).
        onehot = F.one_hot(label.long(), 12).to(torch.float).to(device)
        sim_targets = onehot @ onehot.t()

        # MSE between the row-softmaxed similarities and the targets (unused).
        loss_ita = self.mse(F.softmax(sim_i2t, 1), sim_targets)

        loss1 = F.cross_entropy(output1, label.long())
        loss2 = F.cross_entropy(output2, label.long())

        return image_token, encoded_event
-
class MyModule_v6_2(nn.Module):  # transformer image branch
    """Variant of v6_1 that replaces DeepFeature + Semantic_Token with the
    ViT-style Semantic_Token_v2 tokenizer (which also provides the image
    classification logits). Returns (loss_ita, loss1, loss2)."""

    def __init__(self, args):
        # args must provide .pretrained_bert, .config_path, .embed_dim, .num_class.
        super(MyModule_v6_2, self).__init__()

        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

        # Transformer tokenizer over 9x9 patches of a 32-channel input.
        self.vit4image = Semantic_Token_v2(in_channel=32, model_dim=9*9*32, embed_dim=args.embed_dim, num_class=args.num_class)

        self.classifier_e = Classifier_head_event()

        self.event_embeding = Event_embeding_v3(args.pretrained_bert, args.config_path, args.embed_dim)

        # Per-modality projections into a common embedding space.
        self.image_proj = nn.Linear(64, args.embed_dim)
        self.event_proj = nn.Linear(args.embed_dim, args.embed_dim)

        # Learnable temperature for the similarity logits (clamped each step).
        self.temp = nn.Parameter(torch.ones([]) * 0.07)

        self.mse = MSELoss()

    def forward(self, image, event, label):
        batch_size = image.shape[0]
        # Keep the temperature in a sane range without tracking gradients.
        with torch.no_grad():
            self.temp.clamp_(0.001, 0.5)

        # The ViT branch yields both the image token and the class logits.
        image_token, output1 = self.vit4image(image)
        image_token = F.normalize(self.image_proj(image_token), dim=1)

        encoded_event = self.event_embeding(event)
        encoded_event = F.normalize(self.event_proj(encoded_event), dim=1)

        output2 = self.classifier_e(encoded_event)

        # Temperature-scaled cosine similarities in both directions.
        sim_i2t = image_token @ encoded_event.t() / self.temp
        sim_t2i = sim_i2t.t()

        # Target matrix from label co-occurrence (12 classes assumed).
        onehot = F.one_hot(label.long(), 12).to(torch.float)
        sim_targets = onehot @ onehot.t()

        # MSE between the row-softmaxed similarities and the targets.
        loss_ita = self.mse(F.softmax(sim_i2t, 1), sim_targets)

        loss1 = F.cross_entropy(output1, label.long())
        loss2 = F.cross_entropy(output2, label.long())

        return loss_ita, loss1, loss2
-
class MyModule_v7(nn.Module):  # deeper (stacked-linear) projections, nce-style loss
    """Like v6_1 but with multi-layer linear projections per modality, and the
    alignment loss taken as MSE between the min-max-normalised text->image
    similarity matrix and the label co-occurrence target matrix.

    NOTE(review): forward() currently returns (image_token, encoded_event);
    loss_ita/loss1/loss2 are computed but discarded — matches the original
    (embedding-export mode). Confirm which return is intended for training.
    """

    def __init__(self, args):
        # args must provide .pretrained_bert, .config_path and .embed_dim.
        super(MyModule_v7, self).__init__()

        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

        self.deepfeature = DeepFeature()

        self.classifier = Classifier_head()
        self.classifier_e = Classifier_head_event()

        self.semantic_token = Semantic_Token(64, 1)

        self.event_embeding = Event_embeding_v3(args.pretrained_bert, args.config_path, args.embed_dim)

        # Three stacked linear layers for the image token (no activations).
        self.image_proj = nn.Sequential(
            nn.Linear(args.embed_dim, args.embed_dim),
            nn.Linear(args.embed_dim, args.embed_dim),
            nn.Linear(args.embed_dim, args.embed_dim)
        )

        # Two stacked linear layers for the event embedding (no activations).
        self.event_proj = nn.Sequential(
            nn.Linear(args.embed_dim, args.embed_dim),
            nn.Linear(args.embed_dim, args.embed_dim)
        )

        # Learnable temperature for the similarity logits (clamped each step).
        self.temp = nn.Parameter(torch.ones([]) * 0.07)

    def forward(self, image, event, label):
        batch_size = image.shape[0]
        # Keep the temperature in a sane range without tracking gradients.
        with torch.no_grad():
            self.temp.clamp_(0.001, 0.5)

        image_features = self.deepfeature(image)
        output1 = self.classifier(image_features)
        image_token = self.semantic_token(image_features)
        image_token = F.normalize(self.image_proj(image_token), dim=1)

        encoded_event = self.event_embeding(event)
        encoded_event = F.normalize(self.event_proj(encoded_event), dim=1)

        output2 = self.classifier_e(encoded_event)

        # Temperature-scaled cosine similarities in both directions.
        sim_i2t = image_token @ encoded_event.t() / self.temp
        sim_t2i = sim_i2t.t()

        # Target matrix from label co-occurrence (12 classes assumed).
        onehot = F.one_hot(label.long(), 12).to(torch.float)
        sim_targets = onehot @ onehot.t()

        # Min-max normalise the similarities into [0, 1] before the MSE.
        sim_t2i = (sim_t2i - sim_t2i.min()) / (sim_t2i.max() - sim_t2i.min())

        loss_ita = F.mse_loss(sim_t2i, sim_targets)

        loss1 = F.cross_entropy(output1, label.long())
        loss2 = F.cross_entropy(output2, label.long())

        return image_token, encoded_event
-
class MyModule_v8(nn.Module):  # merged model + cross-attention fusion
    """Contrastive pre-training quantities plus cross-attention fusion of the
    image token with (retrieved) event knowledge.

    NOTE(review): forward() currently returns `encoded_event` only; the fused
    classification loss `loss_final` is computed but not returned (training
    stage toggling, preserved from the original). forward_test() retrieves the
    event embedding from a knowledge bank `kg` via KG() instead of encoding
    ground-truth text.

    Fix vs. original: KG() no longer shadows the module alias `F` (with
    torch.nn.Softmax) or the builtin `list`; its hard nearest-neighbour
    behaviour is unchanged.
    """

    def __init__(self, args):
        # args must provide .pretrained_bert, .config_path, .embed_dim, .num_class.
        super(MyModule_v8, self).__init__()

        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

        self.deepfeature = DeepFeature()

        self.classifier = Classifier_head()
        self.classifier_e = Classifier_head_event()

        self.semantic_token = Semantic_Token(64, 1)

        self.event_embeding = Event_embeding_v3(args.pretrained_bert, args.config_path, args.embed_dim)

        # Three-block MLP projection for the image token.
        self.image_proj = nn.Sequential(
            nn.Linear(args.embed_dim, args.embed_dim),
            nn.LayerNorm(args.embed_dim),
            nn.ReLU(),
            nn.Linear(args.embed_dim, args.embed_dim),
            nn.LayerNorm(args.embed_dim),
            nn.ReLU(),
            nn.Linear(args.embed_dim, args.embed_dim),
            nn.LayerNorm(args.embed_dim),
            nn.ReLU()
        )

        # Two-block MLP projection for the event embedding.
        self.event_proj = nn.Sequential(
            nn.Linear(args.embed_dim, args.embed_dim),
            nn.LayerNorm(args.embed_dim),
            nn.ReLU(),
            nn.Linear(args.embed_dim, args.embed_dim),
            nn.LayerNorm(args.embed_dim),
            nn.ReLU()
        )

        # Learnable temperature for the similarity logits (clamped each step).
        self.temp = nn.Parameter(torch.ones([]) * 0.07)

        # Two cross-attention fusion blocks plus the fused classifier.
        self.cross_att1 = Cross_Attention(args.embed_dim, args.embed_dim)
        self.cross_att2 = Cross_Attention(args.embed_dim, args.embed_dim)
        self.linear = nn.Linear(64 * 3, args.num_class)

    def forward(self, image, event, label):
        batch_size = image.shape[0]
        # Keep the temperature in a sane range without tracking gradients.
        with torch.no_grad():
            self.temp.clamp_(0.001, 0.5)

        image_features = self.deepfeature(image)
        output1 = self.classifier(image_features)
        image_token = self.semantic_token(image_features)
        image_token = self.image_proj(image_token)

        encoded_event = self.event_embeding(event)
        encoded_event = self.event_proj(encoded_event)
        output2 = self.classifier_e(encoded_event)

        # Pre-training alignment: similarities vs. label co-occurrence targets.
        sim_i2t = image_token @ encoded_event.t() / self.temp
        sim_t2i = sim_i2t.t()

        onehot = F.one_hot(label.long(), 12).to(torch.float)
        sim_targets = onehot @ onehot.t()

        # Min-max normalise the similarities into [0, 1] before the MSE.
        sim_t2i = (sim_t2i - sim_t2i.min()) / (sim_t2i.max() - sim_t2i.min())

        loss_ita = F.mse_loss(sim_t2i, sim_targets)
        loss1 = F.cross_entropy(output1, label.long())
        loss2 = F.cross_entropy(output2, label.long())

        # Cross-attention knowledge-fusion branch (loss_final not returned).
        output1 = self.cross_att1(image_token, encoded_event)
        output2 = self.cross_att2(image_token, encoded_event)

        output = torch.cat([output1, output2, image_token], dim=1)
        output = self.linear(output)
        loss_final = F.cross_entropy(output, label.long())

        return encoded_event

    def KG(self, image_token, kg):
        """For each row of `image_token`, return the single most similar row of
        `kg` (softmax over dot-product similarities, then hard argmax lookup)."""
        weights = torch.softmax(torch.mm(image_token, kg.t()), dim=1)
        pos = torch.argmax(weights, 1)
        # Advanced indexing stacks kg[pos[b], :] for every b, as before.
        return kg[pos, :]

    def forward_test(self, image, kg):
        """Inference path: retrieve the event embedding from the knowledge bank
        `kg` (no ground-truth text), then fuse and classify."""
        with torch.no_grad():
            self.temp.clamp_(0.001, 0.5)

        image_features = self.deepfeature(image)
        output1 = self.classifier(image_features)
        image_token = self.semantic_token(image_features)
        image_token = F.normalize(self.image_proj(image_token), dim=1)

        encoded_event = self.KG(image_token, kg)

        output2 = self.classifier_e(encoded_event)

        output1 = self.cross_att1(image_token, encoded_event)
        output2 = self.cross_att2(image_token, encoded_event)

        output = torch.cat([output1, output2, image_token], dim=1)
        output = self.linear(output)

        return output
-
class MyModule_v9(nn.Module):  # classification head moved after the projection
    """Variant of v8: the image classification head is a Linear applied to the
    *projected* image token (instead of the conv-feature MLP head), and KG()
    performs a *soft* retrieval (similarity-weighted mixture of all kg rows).
    forward() returns (loss_ita, loss1, loss2).

    Fixes vs. original: forward_test() referenced the nonexistent attribute
    `self.cross_att` (AttributeError) and fed the raw conv feature map into the
    Linear classifier; it now mirrors forward()/v8 by classifying the projected
    token and fusing via cross_att1/cross_att2. KG() no longer shadows `F`.
    """

    def __init__(self, args):
        # args must provide .pretrained_bert, .config_path, .embed_dim, .num_class.
        super(MyModule_v9, self).__init__()

        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))

        self.deepfeature = DeepFeature()

        # Head applied to the projected image token (moved after projection).
        self.classifier = nn.Linear(args.embed_dim, args.num_class)
        self.classifier_e = Classifier_head_event()

        self.semantic_token = Semantic_Token(64, 1)

        self.event_embeding = Event_embeding_v3(args.pretrained_bert, args.config_path, args.embed_dim)

        # Three-block MLP projection for the image token.
        self.image_proj = nn.Sequential(
            nn.Linear(args.embed_dim, args.embed_dim),
            nn.LayerNorm(args.embed_dim),
            nn.ReLU(),
            nn.Linear(args.embed_dim, args.embed_dim),
            nn.LayerNorm(args.embed_dim),
            nn.ReLU(),
            nn.Linear(args.embed_dim, args.embed_dim),
            nn.LayerNorm(args.embed_dim),
            nn.ReLU()
        )

        # Two-block MLP projection for the event embedding.
        self.event_proj = nn.Sequential(
            nn.Linear(args.embed_dim, args.embed_dim),
            nn.LayerNorm(args.embed_dim),
            nn.ReLU(),
            nn.Linear(args.embed_dim, args.embed_dim),
            nn.LayerNorm(args.embed_dim),
            nn.ReLU()
        )

        # Learnable temperature for the similarity logits (clamped each step).
        self.temp = nn.Parameter(torch.ones([]) * 0.07)

        # Two cross-attention fusion blocks plus the fused classifier.
        self.cross_att1 = Cross_Attention(args.embed_dim, args.embed_dim)
        self.cross_att2 = Cross_Attention(args.embed_dim, args.embed_dim)
        self.linear = nn.Linear(64 * 3, args.num_class)

    def forward(self, image, event, label):
        batch_size = image.shape[0]
        # Keep the temperature in a sane range without tracking gradients.
        with torch.no_grad():
            self.temp.clamp_(0.001, 0.5)

        image_features = self.deepfeature(image)
        image_token = self.semantic_token(image_features)
        image_token = self.image_proj(image_token)
        # Head moved after the projection: classify the projected token.
        output1 = self.classifier(image_token)

        encoded_event = self.event_embeding(event)
        encoded_event = self.event_proj(encoded_event)
        output2 = self.classifier_e(encoded_event)

        # Pre-training alignment: similarities vs. label co-occurrence targets.
        sim_i2t = image_token @ encoded_event.t() / self.temp
        sim_t2i = sim_i2t.t()

        onehot = F.one_hot(label.long(), 12).to(torch.float)
        sim_targets = onehot @ onehot.t()

        # Min-max normalise the similarities into [0, 1] before the MSE.
        sim_t2i = (sim_t2i - sim_t2i.min()) / (sim_t2i.max() - sim_t2i.min())

        loss_ita = F.mse_loss(sim_t2i, sim_targets)
        loss1 = F.cross_entropy(output1, label.long())
        loss2 = F.cross_entropy(output2, label.long())

        return loss_ita, loss1, loss2

    def KG(self, image_token, kg):
        """Soft retrieval: similarity-softmax-weighted mixture of all kg rows."""
        weights = torch.softmax(torch.mm(image_token, kg.t()), dim=1)
        return torch.mm(weights, kg)

    def forward_test(self, image, kg):
        """Inference path: retrieve the event embedding from the knowledge bank
        `kg`, then fuse via cross-attention and classify."""
        with torch.no_grad():
            self.temp.clamp_(0.001, 0.5)

        image_features = self.deepfeature(image)
        image_token = self.semantic_token(image_features)
        image_token = F.normalize(self.image_proj(image_token), dim=1)
        output1 = self.classifier(image_token)

        encoded_event = self.KG(image_token, kg)

        output2 = self.classifier_e(encoded_event)

        output1 = self.cross_att1(image_token, encoded_event)
        output2 = self.cross_att2(image_token, encoded_event)

        output = torch.cat([output1, output2, image_token], dim=1)
        output = self.linear(output)

        return output
-
-
if __name__ == '__main__':
    # Smoke test: a random image cube, three fake 768-d "BERT" event vectors,
    # and class labels in 1..12, fed through MyModuel_v2.
    img_batch = torch.randn(8, 32, 27, 27)
    ev1 = torch.randn(8, 768)
    ev2 = torch.randn(8, 768)
    ev3 = torch.randn(8, 768)
    labels = torch.randint(1, 13, [8])

    ce_loss = CrossEntropyLoss()

    net = MyModuel_v2()
    print(net)
    cls_loss, align_loss = net(ce_loss, labels, img_batch, ev1, ev2, ev3)

    print(cls_loss)
    print(align_loss)
|