import re
import math
import json
import argparse
import sys
import time
import os

from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import torchtext
import numpy as np

from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel

from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer
from torchmetrics import Precision
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

from sentence_transformers import SentenceTransformer
import wikipediaapi

import warnings
warnings.filterwarnings("ignore")


def seed_all(seed=42):
    import torch, random, os, numpy

    if not seed:
        seed = 10

    print("[ Using Seed : ", seed, " ]")

    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.cuda.manual_seed(seed)
    numpy.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# log
class Logger(object):
    def __init__(self, fileN=None):
        self.terminal = sys.stdout
        self.filename = fileN

    def write(self, message):
        with open(self.filename, 'a+') as log:
            self.terminal.write(message)
            log.write(message)

    def flush(self):
        pass


def clean_string(string, args):
    # string = re.sub(r"[^A-Za-z0-9(),!?'`]", " ", string)
    string = re.sub(r"[^A-Za-z0-9,!?]", " ", string)
    string = re.sub(r"\s{2,}", " ", string)  # collapse repeated whitespace
    # remove stopwords and truncate (leaving room for the appended label segment and the special tokens)
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(string)
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    filtered_sentence = ' '.join(filtered_sentence[:args.max_length - 227 - 3])

    return filtered_sentence.lower().strip()


def clean_label_string(string):
    string = string.lower()
    string = string.replace('/', ' ')
    # string = string.split()[0]
    return string


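# Expected JSON layout (inferred from how train['text'] / train['label'] are used below):
#   {"text": ["raw document string", ...], "label": [["topic a", "topic b"], ...]}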
def read_dataset(args, train_path, val_path, embedding_type, threshold):
    train = json.load(open(train_path))
    val = json.load(open(val_path))

    train_sents = [clean_string(text, args) for text in train['text']]
    val_sents = [clean_string(text, args) for text in val['text']]

    train_label_text = [[clean_label_string(s) for s in text] for text in train['label']]
    val_label_text = [[clean_label_string(s) for s in text] for text in val['label']]

    mlb = MultiLabelBinarizer()
    train_labels = mlb.fit_transform(train['label'])
    val_labels = mlb.transform(val['label'])

    return train_sents, train_labels, val_sents, val_labels, train_label_text, val_label_text


def encode_text_label(tokenizer, train_sents, val_sents, train_label_text, val_label_text, max_length):
    # join all labels into one space-separated string
    all_label_flatten = [i for item in train_label_text for i in item]  # flatten all label lists
    all_label_unique = list(set(all_label_flatten))  # deduplicate to get the set of distinct labels
    all_label_text = ' '.join(all_label_unique)  # join into a single string
    all_label_texts = [all_label_text] * len(val_sents)  # replicate once per validation example
    # join the labels of each individual training example (currently unused)
    train_label_texts = [' '.join(i) for i in train_label_text]
    # changed: append the full label set to every training example as the second segment
    all_label_train_texts = [all_label_text] * len(train_sents)
    encoded_train = tokenizer(train_sents, all_label_train_texts, padding=True, truncation=True, max_length=max_length,
                              return_tensors='pt')

    # for ids in encoded_train["input_ids"]:
    #     print(tokenizer.decode(ids))
    # for ids in encoded_train["token_type_ids"]:
    #     print(tokenizer.decode(ids))

    def statisticians(t):
        # index of the first segment-B token, i.e. the length of segment A including [CLS] and [SEP]
        index = 0
        for i in t:
            if i == 1:
                break
            index += 1
        return index

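    # With a BERT-style tokenizer the pair encoding looks like "[CLS] w1 w2 [SEP] l1 l2 [SEP]"
    # with token_type_ids [0, 0, 0, 0, 1, 1, 1] (padding stays in segment 0), so statisticians(...)
    # returns 1 + #text tokens + 1, and sum(token_type_ids) is #label tokens + 1 for the final [SEP].
    # This bookkeeping assumes the tokenizer marks segment B with 1s (bert-base-uncased does).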
    train_token_num = [statisticians(token) - 2 for token in encoded_train["token_type_ids"]]  # text tokens per example
    train_label_num = [sum(token.numpy().tolist()) - 1 for token in encoded_train["token_type_ids"]]  # label tokens per example
    encoded_val = tokenizer(val_sents, all_label_texts, padding=True, truncation=True, max_length=max_length,
                            return_tensors='pt')
    val_token_num = [statisticians(token) - 2 for token in encoded_val["token_type_ids"]]
    val_label_num = [sum(token.numpy().tolist()) - 1 for token in encoded_val["token_type_ids"]]

    return encoded_train, encoded_val, torch.Tensor(train_token_num), torch.Tensor(
        train_label_num), torch.Tensor(val_token_num), torch.Tensor(val_label_num)


def load_data(args, train_path, val_path, max_length, batch_size, device, embedding_type, threshold):
    tokenizer = AutoTokenizer.from_pretrained('pre_model/' + args.pre_model_type)
    train_sents, train_labels, val_sents, val_labels, train_label_text, val_label_text = read_dataset(
        args, train_path, val_path, embedding_type, threshold)

    # count the labels and record the number of text tokens of every example
    X_train, X_val, train_token_num, train_label_num, val_token_num, val_label_num = encode_text_label(
        tokenizer, train_sents, val_sents, train_label_text, val_label_text, max_length)

    # X_train = tokenizer.batch_encode_plus(train_sents, padding=True, truncation=True, max_length=max_length,
    #                                       return_tensors='pt')
    # X_val = tokenizer.batch_encode_plus(val_sents, padding=True, truncation=True, max_length=max_length,
    #                                     return_tensors='pt')

    y_train = torch.tensor(train_labels)
    y_val = torch.tensor(val_labels)

    train_tensor = TensorDataset(X_train['input_ids'].to(device), X_train['attention_mask'].to(device),
                                 X_train['token_type_ids'].to(device), train_token_num.to(device),
                                 train_label_num.to(device), y_train.to(device))
    train_loader = DataLoader(train_tensor, batch_size=batch_size, shuffle=True)

    val_tensor = TensorDataset(X_val['input_ids'].to(device), X_val['attention_mask'].to(device),
                               X_val['token_type_ids'].to(device), val_token_num.to(device),
                               val_label_num.to(device), y_val.to(device))
    val_loader = DataLoader(val_tensor, batch_size=batch_size, shuffle=False)

    return train_loader, val_loader


class EarlyStopping:
    def __init__(self, patience=5, delta=0):
        self.patience = patience
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.inf
        self.delta = delta

    def __call__(self, val_loss):

        score = -val_loss
        if self.best_score is None:
            self.best_score = score
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.counter = 0

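
# Summary of the class below: BertAttention encodes "[CLS] text [SEP] all-labels [SEP]" with BERT,
# builds a label-guided attention over the text tokens (bilinear weight myW between each text token
# and the mean label embedding), applies one single-query attention over the whole sequence
# (the "Self-attention" step), and finishes with a linear classifier whose 101 outputs are assumed
# to match the number of classes produced by MultiLabelBinarizer.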
class BertAttention(nn.Module):
    def __init__(self, args):
        super(BertAttention, self).__init__()
        self.bert = AutoModel.from_pretrained('pre_model/' + args.pre_model_type, output_hidden_states=True)
        self.dropout = nn.Dropout(0.2)
        # one (hidden x hidden) bilinear weight per position in the batch
        self.myW = nn.Parameter(
            torch.FloatTensor(args.batch_size, self.bert.config.hidden_size, self.bert.config.hidden_size))
        torch.nn.init.xavier_uniform_(self.myW)

        self.WK = nn.Linear(self.bert.config.hidden_size, self.bert.config.hidden_size)
        self.WV = nn.Linear(self.bert.config.hidden_size, self.bert.config.hidden_size)

        self.linear = nn.Linear(self.bert.config.hidden_size, 101)

    def forward(self, input_ids, attention_mask, token_type_ids, token_num, label_num):
        # pass by keyword: the positional order of BertModel.forward is (input_ids, attention_mask, token_type_ids)
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # bert_output = bert_output['last_hidden_state']
        # bert_output = self.dropout(bert_output)
        bert_output = bert_output.hidden_states
        # sum of the last and the first encoder layer hidden states
        bert_output = (bert_output[-1] + bert_output[1])
        batch_size = bert_output.size(0)
        dim = bert_output.size(-1)

        # extract the text embedding and the label embedding of every sequence
        Text_reps = []
        label_reps = []
        token_num = [int(i) for i in token_num.cpu().numpy().tolist()]
        label_num = [int(i) for i in label_num.cpu().numpy().tolist()]
        for ites in range(batch_size):
            support_i = torch.index_select(input=bert_output, dim=0,
                                           index=torch.LongTensor([ites]).to(bert_output.device)).squeeze(0)
            # text tokens sit at positions 1 .. token_num (after [CLS]); label tokens start after the first [SEP]
            reps_i = support_i.narrow(-2, 1, token_num[ites])
            label_reps_i = support_i.narrow(-2, 2 + token_num[ites], label_num[ites]).mean(dim=-2, keepdim=True)
            # label-guided attention over the text tokens
            myW = torch.index_select(input=self.myW, dim=0,
                                     index=torch.LongTensor([ites]).to(bert_output.device)).squeeze(0)
            A = torch.matmul(torch.matmul(reps_i, myW), label_reps_i.permute(1, 0))
            B = F.softmax(A, dim=0)
            B = B.permute(1, 0)
            final_embed = torch.matmul(B, reps_i)

            Text_reps.append(final_embed)
            # label_reps.append(label_reps_i)
        Text_reps = torch.stack(Text_reps)
        # label_reps = torch.stack(label_reps)

        # Self-attention: the label-guided text vector attends over the whole sequence
        Q = Text_reps
        K = self.WK(bert_output)
        V = self.WV(bert_output)
        score = torch.bmm(Q, K.permute(0, 2, 1)) / math.sqrt(dim)
        p = F.softmax(score, -1)
        output = torch.bmm(p, V).squeeze(1)

        output = self.linear(output)

        return output


# Init model and optimizer & schedule
def initialize_model(device, len_trainloader, epochs, args, lr=3e-5):
    model = BertAttention(args)
    model.to(device)

    no_decay = ['bias', 'LayerNorm.weight']
    param_optimizer = [[name, para] for name, para in model.named_parameters() if para.requires_grad]
    optimizer_grouped_parameters = [
        {'params': [param for name, param in param_optimizer if not any(nd in name for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [param for name, param in param_optimizer if any(nd in name for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
    n_steps = len_trainloader * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer=optimizer, num_training_steps=n_steps, num_warmup_steps=100)
    criterion = nn.BCEWithLogitsLoss()

    return model, optimizer, scheduler, criterion


def step(model, optimizer, scheduler, criterion, batch):
    input_ids, attention_mask, token_type_ids, train_token_num, train_label_num, label = batch
    optimizer.zero_grad()
    y_pred = model.forward(input_ids, attention_mask, token_type_ids, train_token_num, train_label_num)

    loss = criterion(y_pred, label.float())
    loss.backward()

    optimizer.step()
    scheduler.step()

    return loss.item()


def validate(model, criterion, val_loader):
    print("Evaluating...")
    model.eval()
    with torch.no_grad():
        running_loss = 0.0
        pred_labels, targets = list(), list()

        for _, batch in enumerate(val_loader):
            input_ids, attention_mask, token_type_ids, val_token_num, val_label_num, y_true = batch
            output = model(input_ids, attention_mask, token_type_ids, val_token_num, val_label_num)
            loss = criterion(output, y_true.float())

            running_loss += loss.item()

            pred_labels.extend(torch.sigmoid(output).detach().cpu().numpy())
            targets.extend(y_true.detach().cpu().numpy())

        val_loss = running_loss / len(val_loader)

        pred_labels, targets = np.array(pred_labels), np.array(targets)
        accuracy = metrics.accuracy_score(targets, pred_labels.round())
        micro_f1 = metrics.f1_score(targets, pred_labels.round(), average='micro')
        macro_f1 = metrics.f1_score(targets, pred_labels.round(), average='macro')

        ndcg1 = metrics.ndcg_score(targets, pred_labels, k=1)
        ndcg3 = metrics.ndcg_score(targets, pred_labels, k=3)
        ndcg5 = metrics.ndcg_score(targets, pred_labels, k=5)

        # top-k precision via torchmetrics
        p1 = Precision(num_classes=101, top_k=1)(torch.tensor(pred_labels), torch.tensor(targets))
        p3 = Precision(num_classes=101, top_k=3)(torch.tensor(pred_labels), torch.tensor(targets))
        p5 = Precision(num_classes=101, top_k=5)(torch.tensor(pred_labels), torch.tensor(targets))

    return val_loss, accuracy, micro_f1, macro_f1, ndcg1, ndcg3, ndcg5, p1, p3, p5


def train(model, optimizer, scheduler, criterion, train_loader, val_loader, checkpoint, epochs=20):
    early_stopping = EarlyStopping(delta=1e-5, patience=10)
    train_losses, val_losses, val_accs = [], [], []

    for epoch in range(epochs):
        running_loss = 0.0
        model.train()
        for i, batch in enumerate(train_loader):
            loss = step(model, optimizer, scheduler, criterion, batch)
            running_loss += loss
            if (i + 1) % 100 == 0 or i == 0:
                print("Epoch: {} - iter: {}/{} - train_loss: {}".format(epoch + 1, i + 1, len(train_loader),
                                                                        running_loss / (i + 1)))
        # epoch-average training loss
        print("Epoch: {} - iter: {}/{} - train_loss: {}".format(epoch + 1, i + 1, len(train_loader),
                                                                running_loss / len(train_loader)))
        val_loss, accuracy, micro_f1, macro_f1, ndcg1, ndcg3, ndcg5, p1, p3, p5 = validate(model, criterion,
                                                                                           val_loader)

        train_losses.append(running_loss / (i + 1))
        val_losses.append(val_loss), val_accs.append(accuracy)
        print("Val_loss: {} - Accuracy: {} - Micro-F1: {} - Macro-F1: {}".format(val_loss, accuracy, micro_f1,
                                                                                 macro_f1))
        print("nDCG@1: {} - nDCG@3: {} - nDCG@5: {} - P@1: {} - P@3: {} - P@5: {}".format(ndcg1, ndcg3, ndcg5,
                                                                                          p1, p3, p5))

        early_stopping(val_loss)
        if early_stopping.early_stop:
            print('Early stopping. Previous model saved in: ', checkpoint)
            train_losses, val_losses, val_accs = np.array(train_losses).reshape(-1, 1), np.array(
                val_losses).reshape(-1, 1), np.array(val_accs).reshape(-1, 1)
            np.savetxt(os.path.join(checkpoint, 'log.txt'), np.hstack((train_losses, val_losses, val_accs)),
                       delimiter='#')
            break
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            # 'optimizer_state_dict': optimizer.state_dict(),
            # 'scheduler': scheduler.state_dict(),
            'val_loss': val_loss
        }, os.path.join(checkpoint, 'cp' + str(epoch + 1) + '.pt'))

    train_losses, val_losses, val_accs = np.array(train_losses).reshape(-1, 1), np.array(val_losses).reshape(-1, 1), \
        np.array(val_accs).reshape(-1, 1)
    np.savetxt(os.path.join(checkpoint, 'log.txt'), np.hstack((train_losses, val_losses, val_accs)), delimiter='#')


def main():
    parser = argparse.ArgumentParser(description='RCV1 Classification')

    parser.add_argument('--model_name', type=str, default='train1', help='model name')
    parser.add_argument('--train_data', type=str, default='data/rcv1_train_data.json',
                        help='path to the train dataset JSON file')
    parser.add_argument('--val_data', type=str, default='data/rcv1_val_data.json',
                        help='path to the val dataset JSON file')
    parser.add_argument('--lr', type=float, default=5e-5, help='learning rate')
    parser.add_argument('--batch_size', type=int, default=16, help='batch size')
    parser.add_argument('--epochs', type=int, default=10, help='number of epochs')
    parser.add_argument('--max_length', type=int, default=484, help='max sequence length')
    parser.add_argument('--embedding_type', type=str, default='wiki',
                        help='type of the word embedding: wiki, random, glove, fasttext, ggnews')
    parser.add_argument('--checkpoint', type=str, default='checkpoint', help='checkpoint directory')
    # parser.add_argument('--resume', type=int, default=0, help='resume train model from checkpoint')
    parser.add_argument('--graph_feature', type=str, default='./data/graph_feature.pth',
                        help='path to feature of graph: adjacency, node feature')
    parser.add_argument('--threshold', type=float, default=0.0)
    parser.add_argument('--pre_model_type', type=str, default='bert-base-uncased',
                        help='type of the model: roberta-base, bert-base-uncased')
    parser.add_argument('--device', type=str, default='cpu', help='cuda or cpu')
    parser.add_argument('--log', type=str, default='True', help='True or False')
    args = parser.parse_args()

    args.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # log (make sure the checkpoint directory exists before writing into it)
    os.makedirs(args.checkpoint, exist_ok=True)
    time_now = time.strftime('%Y-%m-%d_%H-%M-%S', time.localtime(time.time()))
    if args.log == 'True':
        sys.stdout = Logger(args.checkpoint + '/' + args.model_name + '+' + time_now + '.txt')

    seed_all()

    # read dataset
    print('reading dataset...')
    train_loader, val_loader = load_data(args=args, train_path=args.train_data,
                                         val_path=args.val_data,
                                         max_length=args.max_length, batch_size=args.batch_size,
                                         device=args.device, embedding_type=args.embedding_type,
                                         threshold=args.threshold)

    print('initialize model')
    model, optimizer, scheduler, criterion = initialize_model(device=args.device,
                                                              len_trainloader=len(train_loader), epochs=args.epochs,
                                                              args=args, lr=args.lr)

    print('training model...')
    train(model=model, optimizer=optimizer, scheduler=scheduler, criterion=criterion, train_loader=train_loader,
          val_loader=val_loader, checkpoint=args.checkpoint, epochs=args.epochs)


if __name__ == '__main__':
    main()
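
# Example run (the script filename is a placeholder; data paths and the local
# pre_model/bert-base-uncased directory follow the argparse defaults above):
#   python this_script.py --pre_model_type bert-base-uncased --batch_size 16 --epochs 10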

# Truncate the sentences  # remove stopwords
#
# Append the labels after the sentence as segment B