|
- import numpy as np
- from mxnet.contrib import text
- import torch.utils.data as data_utils
- import torch
- import pandas as pd
- from transformers import AutoTokenizer, AutoModel
- from torch.utils.data import TensorDataset, DataLoader
-
def load_aapd_data(path, batch_size=64, max_length=200):
    """Load the preprocessed (already tokenized-to-ids) AAPD dataset.

    Args:
        path: Project root prefix; must end with a path separator, since it is
            concatenated directly with ``"dataset/aapd/..."``.
        batch_size: Mini-batch size for both loaders.
        max_length: Documents are truncated to their first ``max_length`` token ids.

    Returns:
        Tuple of (train_loader, test_loader, word_embedding_matrix as a NumPy
        array, number of labels, label embedding matrix).
    """
    # Truncate each document to the first `max_length` token ids.
    X_tst = np.load(path + "dataset/aapd/X_test.npy")[:, :max_length]
    X_trn = np.load(path + "dataset/aapd/X_train.npy")[:, :max_length]
    Y_trn = np.load(path + "dataset/aapd/y_train.npy")
    Y_tst = np.load(path + "dataset/aapd/y_test.npy")
    label_embed = np.load(path + "dataset/aapd/label_embed.npy")

    # Pretrained word embeddings stored in mxnet's CustomEmbedding text format.
    embed = text.embedding.CustomEmbedding(path + 'dataset/aapd/word_embed.txt')

    train_data = data_utils.TensorDataset(torch.from_numpy(X_trn).type(torch.IntTensor),
                                          torch.from_numpy(Y_trn).type(torch.IntTensor))
    test_data = data_utils.TensorDataset(torch.from_numpy(X_tst).type(torch.IntTensor),
                                         torch.from_numpy(Y_tst).type(torch.IntTensor))
    # drop_last=True keeps every batch exactly `batch_size` rows (model may
    # assume a fixed batch dimension).
    train_loader = data_utils.DataLoader(train_data, batch_size, shuffle=True, drop_last=True)
    test_loader = data_utils.DataLoader(test_data, batch_size, drop_last=True)

    # Derive the label count from the data instead of hard-coding 54; this is
    # identical (54) for the real AAPD split but generalizes to other splits.
    label_num = Y_trn.shape[1]
    return train_loader, test_loader, embed.idx_to_vec.asnumpy(), label_num, label_embed
-
def load_aapd_bert_data(path, batch_size=64, max_length=200):
    """Load the raw AAPD dataset and encode it with a BERT tokenizer.

    The train and validation tsv splits are concatenated and used for
    training; the test split serves as the evaluation set.

    Args:
        path: Project root prefix; must end with a path separator, since it is
            concatenated directly with ``"dataset/..."``.
        batch_size: Mini-batch size for both loaders.
        max_length: Tokenizer truncation length.

    Returns:
        Tuple of (train_loader, val_loader, None placeholder for the word
        embedding matrix, number of labels, label embedding tensor).
    """
    # Each tsv row is: column 0 = label bit-string (e.g. "0101..."),
    # column 1 = document text.
    train = pd.read_csv(path + 'dataset/AAPD_raw_data/data/aapd_train.tsv', sep='\t', header=None)
    val = pd.read_csv(path + 'dataset/AAPD_raw_data/data/aapd_validation.tsv', sep='\t', header=None)
    test = pd.read_csv(path + 'dataset/AAPD_raw_data/data/aapd_test.tsv', sep='\t', header=None)
    # Fold the validation split into training.
    train = pd.concat([train, val])

    # Extract sentences.
    train_sent = train[1].tolist()
    test_sent = test[1].tolist()

    # Build the multi-hot label matrix: each label string like "0101..." is
    # expanded into a row of ints.
    train_label = np.array([list(map(int, list(i))) for i in train[0].tolist()])
    test_label = np.array([list(map(int, list(i))) for i in test[0].tolist()])

    tokenizer = AutoTokenizer.from_pretrained(path + 'dataset/bert-base-uncased')
    X_train = tokenizer.batch_encode_plus(train_sent, padding=True, truncation=True, max_length=max_length,
                                          return_tensors='pt')
    y_train = torch.tensor(train_label)
    X_val = tokenizer.batch_encode_plus(test_sent, padding=True, truncation=True, max_length=max_length,
                                        return_tensors='pt')
    y_val = torch.tensor(test_label)

    train_tensor = TensorDataset(X_train['input_ids'], X_train['attention_mask'],
                                 X_train['token_type_ids'], y_train)
    train_loader = DataLoader(train_tensor, batch_size=batch_size, shuffle=True)

    val_tensor = TensorDataset(X_val['input_ids'], X_val['attention_mask'],
                               X_val['token_type_ids'], y_val)
    val_loader = DataLoader(val_tensor, batch_size=batch_size, shuffle=False)

    # Derive the label count from the data instead of hard-coding 54; this is
    # identical (54) for the real AAPD split but generalizes to other splits.
    label_num = train_label.shape[1]
    label_embed = torch.load(path + 'dataset/AAPD_raw_data/bert_AAPD_wiki_label_embedding_type_train_embedding.pt')

    # None keeps the return shape consistent with load_aapd_data, which
    # returns a word embedding matrix in that slot.
    return train_loader, val_loader, None, label_num, label_embed
|