# -*- coding: utf-8 -*-

import os
import pickle

import numpy as np
import torch
from torch.utils.data import Dataset
from pytorch_pretrained_bert import BertTokenizer

def build_tokenizer(fnames, max_seq_len, dat_fname, step=3):
    """Build (or load a cached) whitespace tokenizer fitted on the given dataset files."""
    if os.path.exists(dat_fname):
        print('loading tokenizer:', dat_fname)
        with open(dat_fname, 'rb') as f:
            tokenizer = pickle.load(f)
    else:
        text = ''
        for fname in fnames:
            with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
                lines = fin.readlines()
            # Each record spans `step` lines: the context with a $T$ placeholder,
            # then the aspect term (remaining lines of the record are ignored here).
            for i in range(0, len(lines), step):
                text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")]
                aspect = lines[i + 1].lower().strip()
                text_raw = text_left + " " + aspect + " " + text_right
                text += text_raw + " "

        tokenizer = Tokenizer(max_seq_len)
        tokenizer.fit_on_text(text)
        # Ensure the cache directory exists; os.makedirs handles nested paths,
        # unlike the single-level os.mkdir used before.
        cache_dir = os.path.dirname(dat_fname)
        if cache_dir and not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        with open(dat_fname, 'wb') as f:
            pickle.dump(tokenizer, f)
    return tokenizer
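
# A minimal usage sketch (the dataset paths and cache file below are
# placeholders, not files shipped with this module):
#   tokenizer = build_tokenizer(
#       fnames=['datasets/train.raw', 'datasets/test.raw'],
#       max_seq_len=80,
#       dat_fname='state/tokenizer.dat')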


def _load_word_vec(path, word2idx=None, embed_dim=300):
    """Read GloVe-style vectors, keeping only words present in word2idx (if given)."""
    word_vec = {}
    with open(path, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        for line in fin:
            tokens = line.rstrip().split()
            # Slice from the end: some GloVe releases contain tokens with embedded
            # spaces, which tokens[0] / tokens[1:] would mis-split.
            word, vec = ' '.join(tokens[:-embed_dim]), tokens[-embed_dim:]
            if word2idx is None or word in word2idx:
                word_vec[word] = np.asarray(vec, dtype='float32')
    return word_vec


def build_embedding_matrix(word2idx, embed_dim, dat_fname):
    if os.path.exists(dat_fname):
        print('loading embedding_matrix:', dat_fname)
        with open(dat_fname, 'rb') as f:
            embedding_matrix = pickle.load(f)
    else:
        print('loading word vectors...')
        # idx 0 (padding) and idx len(word2idx)+1 (OOV) stay all-zeros
        embedding_matrix = np.zeros((len(word2idx) + 2, embed_dim))
        # NOTE: the 300-d path is machine-specific; point it at your local copy.
        fname = ('./glove.twitter.27B/glove.twitter.27B.' + str(embed_dim) + 'd.txt'
                 if embed_dim != 300 else '/home/yinrongdi/vector/glove.42B.300d.txt')
        word_vec = _load_word_vec(fname, word2idx=word2idx, embed_dim=embed_dim)
        print('building embedding_matrix:', dat_fname)
        for word, i in word2idx.items():
            vec = word_vec.get(word)
            if vec is not None:
                # words not found in the embedding file keep their all-zero rows
                embedding_matrix[i] = vec
        with open(dat_fname, 'wb') as f:
            pickle.dump(embedding_matrix, f)
    return embedding_matrix
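
# A minimal sketch of the usual pipeline (the cache path is a placeholder):
#   embedding_matrix = build_embedding_matrix(
#       word2idx=tokenizer.word2idx,
#       embed_dim=300,
#       dat_fname='state/300_glove_embedding_matrix.dat')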


def pad_and_truncate(sequence, maxlen, dtype='int64', padding='post', truncating='post', value=0):
    """Keras-style padding/truncation of a 1-D id sequence to exactly maxlen."""
    x = (np.ones(maxlen) * value).astype(dtype)
    if truncating == 'pre':
        trunc = sequence[-maxlen:]
    else:
        trunc = sequence[:maxlen]
    trunc = np.asarray(trunc, dtype=dtype)
    if padding == 'post':
        x[:len(trunc)] = trunc
    else:
        x[-len(trunc):] = trunc
    return x
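
# Worked examples, assuming the defaults above:
#   pad_and_truncate([3, 1, 4, 1, 5], maxlen=4)                    -> [3, 1, 4, 1]
#   pad_and_truncate([3, 1, 4, 1, 5], maxlen=4, truncating='pre')  -> [1, 4, 1, 5]
#   pad_and_truncate([3, 1], maxlen=4, padding='pre')              -> [0, 0, 3, 1]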


class Tokenizer(object):
    def __init__(self, max_seq_len, lower=True):
        self.lower = lower
        self.max_seq_len = max_seq_len
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 1  # id 0 is reserved for padding

    def fit_on_text(self, text):
        if self.lower:
            text = text.lower()
        words = text.split()
        for word in words:
            if word not in self.word2idx:
                self.word2idx[word] = self.idx
                self.idx2word[self.idx] = word
                self.idx += 1

    def text_to_sequence(self, text, reverse=False, padding='post', truncating='post'):
        if self.lower:
            text = text.lower()
        words = text.split()
        unknownidx = len(self.word2idx) + 1  # all out-of-vocabulary words share one id
        sequence = [self.word2idx[w] if w in self.word2idx else unknownidx for w in words]
        if len(sequence) == 0:
            sequence = [0]
        if reverse:
            sequence = sequence[::-1]
        return pad_and_truncate(sequence, self.max_seq_len, padding=padding, truncating=truncating)
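
# Example round trip (ids start at 1; 'awful' is unseen, so it maps to
# len(word2idx) + 1 == 5, and the trailing 0 is padding):
#   t = Tokenizer(max_seq_len=5)
#   t.fit_on_text('the food was great')
#   t.text_to_sequence('the food was awful')  -> [1, 2, 3, 5, 0]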


class Tokenizer4Bert:
    def __init__(self, max_seq_len, pretrained_bert_name):
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_bert_name)
        self.max_seq_len = max_seq_len

    def text_to_sequence(self, text, reverse=False, padding='post', truncating='post'):
        sequence = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text))
        if len(sequence) == 0:
            sequence = [0]
        if reverse:
            sequence = sequence[::-1]
        return pad_and_truncate(sequence, self.max_seq_len, padding=padding, truncating=truncating)
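
# Usage sketch ('bert-base-uncased' is one valid checkpoint name for
# pytorch_pretrained_bert; substitute your own):
#   bert_tokenizer = Tokenizer4Bert(max_seq_len=80, pretrained_bert_name='bert-base-uncased')
#   ids = bert_tokenizer.text_to_sequence('[CLS] the food was great [SEP]')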


class ABSADataset(Dataset):
    def __init__(self, fname, tokenizer):
        with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
            lines = fin.readlines()

        all_data = []
        # Each sample spans three lines: the context with a $T$ placeholder,
        # the aspect term, and the polarity label (-1/0/1).
        for i in range(0, len(lines), 3):
            text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")]
            aspect = lines[i + 1].lower().strip()
            polarity = lines[i + 2].strip()

            text_raw_indices = tokenizer.text_to_sequence(text_left + " " + aspect + " " + text_right)
            text_raw_without_aspect_indices = tokenizer.text_to_sequence(text_left + " " + text_right)
            text_left_indices = tokenizer.text_to_sequence(text_left)
            text_left_with_aspect_indices = tokenizer.text_to_sequence(text_left + " " + aspect)
            text_right_indices = tokenizer.text_to_sequence(text_right, reverse=True)
            text_right_with_aspect_indices = tokenizer.text_to_sequence(" " + aspect + " " + text_right, reverse=True)
            aspect_indices = tokenizer.text_to_sequence(aspect)
            left_context_len = np.sum(text_left_indices != 0)
            aspect_len = np.sum(aspect_indices != 0)
            aspect_in_text = torch.tensor([left_context_len.item(), (left_context_len + aspect_len - 1).item()])
            polarity = int(polarity) + 1  # map {-1, 0, 1} to class ids {0, 1, 2}

            text_bert_indices = tokenizer.text_to_sequence('[CLS] ' + text_left + " " + aspect + " " + text_right + ' [SEP] ' + aspect + " [SEP]")
            bert_segments_ids = np.asarray([0] * (np.sum(text_raw_indices != 0) + 2) + [1] * (aspect_len + 1))
            bert_segments_ids = pad_and_truncate(bert_segments_ids, tokenizer.max_seq_len)

            # Attention mask: 1 for real tokens, 0 for padding. (Deriving it from
            # bert_segments_ids after padding would always yield all ones, since
            # that array is already max_seq_len long.)
            text_bert_mask = np.asarray(text_bert_indices != 0, dtype='int64')

            text_raw_bert_indices = tokenizer.text_to_sequence("[CLS] " + text_left + " " + aspect + " " + text_right + " [SEP]")
            aspect_bert_indices = tokenizer.text_to_sequence("[CLS] " + aspect + " [SEP]")

            data = {
                'text_bert_indices': text_bert_indices,
                'bert_segments_ids': bert_segments_ids,
                'text_bert_mask': text_bert_mask,
                'text_raw_bert_indices': text_raw_bert_indices,
                'aspect_bert_indices': aspect_bert_indices,
                'text_raw_indices': text_raw_indices,
                'text_raw_without_aspect_indices': text_raw_without_aspect_indices,
                'text_left_indices': text_left_indices,
                'text_left_with_aspect_indices': text_left_with_aspect_indices,
                'text_right_indices': text_right_indices,
                'text_right_with_aspect_indices': text_right_with_aspect_indices,
                'aspect_indices': aspect_indices,
                'aspect_in_text': aspect_in_text,
                'polarity': polarity,
            }

            all_data.append(data)
        self.data = all_data

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)
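
# Typical consumption, sketched (the dataset path is a placeholder):
#   from torch.utils.data import DataLoader
#   train_set = ABSADataset('datasets/restaurant_train.raw', tokenizer)
#   train_loader = DataLoader(train_set, batch_size=16, shuffle=True)
#   batch = next(iter(train_loader))  # dict of batched tensors keyed as above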


class TABSADataset(Dataset):
    def __init__(self, fname, tokenizer, with_absa=True):
        with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
            lines = fin.readlines()

        # Category -> id maps in the SemEval restaurant "entity attribute" style.
        # aspect_dic_15/16 are only needed by the commented-out aspect_category
        # lookup below; classifier_dic drives classifier_polarity.
        aspect_dic_15 = {'restaurant general': 0, 'restaurant prices': 1, 'drinks quality': 2,
                         'location general': 3, 'service general': 4, 'food prices': 5,
                         'restaurant miscellaneous': 6, 'drinks prices': 7, 'drinks style_options': 8,
                         'ambience general': 9, 'food style_options': 10, 'food quality': 11,
                         'food general': 12}

        aspect_dic_16 = {'restaurant general': 0, 'restaurant prices': 1, 'drinks quality': 2,
                         'location general': 3, 'service general': 4, 'food prices': 5,
                         'restaurant miscellaneous': 6, 'drinks prices': 7, 'drinks style_options': 8,
                         'ambience general': 9, 'food style_options': 10, 'food quality': 11}

        classifier_dic = {'general': 0, 'miscellaneous': 1, 'prices': 2, 'style_options': 3, 'quality': 4}

        all_data = []
        # Each sample spans four lines: the context with a $T$ placeholder, the aspect
        # term, the polarity label (-1/0/1), and the "entity attribute" category.
        for i in range(0, len(lines), 4):
            text_left, _, text_right = [s.lower().strip() for s in lines[i].partition("$T$")]
            # Empty contexts are replaced with a literal 'pad' token.
            if len(text_left) == 0:
                text_left = 'pad'
            if len(text_right) == 0:
                text_right = 'pad'
            aspect = lines[i + 1].lower().strip()
            if len(aspect) == 0:
                continue
            polarity = lines[i + 2].strip()
            Taspect = lines[i + 3].lower().strip()
            entity, attribute = Taspect.split(' ')

            text_raw_indices = tokenizer.text_to_sequence(text_left + " " + aspect + " " + text_right)
            text_raw_without_aspect_indices = tokenizer.text_to_sequence(text_left + " " + text_right)
            text_left_indices = tokenizer.text_to_sequence(text_left)
            text_left_with_aspect_indices = tokenizer.text_to_sequence(text_left + " " + aspect)
            text_right_indices = tokenizer.text_to_sequence(text_right, reverse=True)
            text_right_with_aspect_indices = tokenizer.text_to_sequence(" " + aspect + " " + text_right, reverse=True)
            # aspect_indices uses the "entity attribute" category string when
            # with_absa is set, otherwise the literal aspect term.
            if with_absa:
                aspect_indices = tokenizer.text_to_sequence(Taspect)
            else:
                aspect_indices = tokenizer.text_to_sequence(aspect)
            entity_indices = tokenizer.text_to_sequence(entity)
            attribute_indices = tokenizer.text_to_sequence(attribute)
            left_context_len = np.sum(text_left_indices != 0)
            aspect_len = np.sum(aspect_indices != 0)
            aspect_in_text = torch.tensor([left_context_len.item(), (left_context_len + aspect_len - 1).item()])
            polarity = int(polarity) + 1  # map {-1, 0, 1} to class ids {0, 1, 2}

            text_bert_indices = tokenizer.text_to_sequence('[CLS] ' + text_left + " " + aspect + " " + text_right + ' [SEP] ' + Taspect + " [SEP]")
            bert_segments_ids = np.asarray([0] * (np.sum(text_raw_indices != 0) + 2) + [1] * (aspect_len + 1))
            bert_segments_ids = pad_and_truncate(bert_segments_ids, tokenizer.max_seq_len)

            # Attention mask: 1 for real tokens, 0 for padding (same fix as in ABSADataset).
            text_bert_mask = np.asarray(text_bert_indices != 0, dtype='int64')

            text_raw_bert_indices = tokenizer.text_to_sequence("[CLS] " + text_left + " " + aspect + " " + text_right + " [SEP]")
            aspect_bert_indices = tokenizer.text_to_sequence("[CLS] " + Taspect + " [SEP]")
            entity_bert_indices = tokenizer.text_to_sequence("[CLS] " + entity + " [SEP]")
            attribute_bert_indices = tokenizer.text_to_sequence("[CLS] " + attribute + " [SEP]")

            # Attribute-classifier label; the original offset (+5-1 == +4) is kept.
            classifier_polarity = classifier_dic[attribute] + 4
            # classifier_polarity = 0
            # aspect_category = aspect_dic_16[Taspect]
            data = {
                'text_bert_indices': text_bert_indices,
                'bert_segments_ids': bert_segments_ids,
                'text_bert_mask': text_bert_mask,
                'text_raw_bert_indices': text_raw_bert_indices,
                'aspect_bert_indices': aspect_bert_indices,
                'text_raw_indices': text_raw_indices,
                'text_raw_without_aspect_indices': text_raw_without_aspect_indices,
                'text_left_indices': text_left_indices,
                'text_left_with_aspect_indices': text_left_with_aspect_indices,
                'text_right_indices': text_right_indices,
                'text_right_with_aspect_indices': text_right_with_aspect_indices,
                'aspect_indices': aspect_indices,
                'aspect_in_text': aspect_in_text,
                'entity_indices': entity_indices,
                'attribute_indices': attribute_indices,
                'entity_bert_indices': entity_bert_indices,
                'attribute_bert_indices': attribute_bert_indices,
                'polarity': polarity,
                'classifier_polarity': classifier_polarity,
                # 'aspect_category': aspect_category,
            }

            all_data.append(data)
        self.data = all_data

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)
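

if __name__ == '__main__':
    # Minimal smoke test covering only the dependency-free pieces of this
    # module (no GloVe files or BERT checkpoints are required).
    assert list(pad_and_truncate([3, 1, 4, 1, 5], maxlen=4)) == [3, 1, 4, 1]
    demo_tokenizer = Tokenizer(max_seq_len=5)
    demo_tokenizer.fit_on_text('the food was great')
    print(demo_tokenizer.text_to_sequence('the food was awful'))  # -> [1 2 3 5 0]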