- # Copyright (c) 2021 Baidu.com, Inc. All Rights Reserved
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import paddle
- import paddle.nn as nn
- import codecs
- import json
- import numpy as np
- import pandas as pd
-
-
- class BCELossForDuIE(nn.Layer):
- def __init__(self):
- super(BCELossForDuIE, self).__init__()
- self.criterion = nn.BCEWithLogitsLoss(reduction='none')
-
- def forward(self, logits, labels, mask):
- loss = self.criterion(logits, labels)
- mask = paddle.cast(mask, 'float32')
- loss = loss * mask.unsqueeze(-1)  # zero out the loss at padded positions
- loss = paddle.sum(loss.mean(axis=2), axis=1) / paddle.sum(mask, axis=1)  # per-sequence mean over valid tokens
- loss = loss.mean()
- return loss
-
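- # Minimal usage sketch (illustrative only; the shapes below are assumptions, not taken
- # from the training script): `logits` and `labels` are float tensors of shape
- # [batch_size, seq_len, num_labels], and `mask` marks the non-padding tokens.
- #   criterion = BCELossForDuIE()
- #   logits = paddle.rand([2, 8, 5])
- #   labels = paddle.randint(0, 2, [2, 8, 5]).astype('float32')
- #   mask = paddle.to_tensor([[1] * 8, [1] * 6 + [0] * 2])
- #   loss = criterion(logits, labels, mask)  # scalar: mean of the per-sequence masked losses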
-
- def get_optimizer(model, lr, weight_decay):
- optimizer = paddle.optimizer.AdamW(
- learning_rate=lr,
- parameters=model.parameters(),
- weight_decay=weight_decay,
- apply_decay_param_fun=lambda x: x in [
- p.name for n, p in model.named_parameters()
- if not any(nd in n for nd in ["bias", "norm"])
- ]
- )
-
- return optimizer
-
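- # Usage sketch (hedged; `model`, the learning rate and the decay value are placeholders):
- # parameters whose names contain "bias" or "norm" are excluded from weight decay through
- # apply_decay_param_fun.
- #   optimizer = get_optimizer(model, lr=2e-5, weight_decay=0.01)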
-
- def find_entity(text_raw, id_, predictions, tok_to_orig_start_index, tok_to_orig_end_index, rel_num):
- """
- retrieve the entity mentions labelled with the given predicate id from the per-token predictions.
- this is called by the `decoding` function.
- """
- entity_list = []
- for i in range(len(predictions)):
- if [id_] in predictions[i]:
- j = 0  # search forward from this token; the label at index id_ + rel_num is the "I" (inside) tag
- while i + j + 1 < len(predictions):
- if [id_ + rel_num] in predictions[i + j + 1]:
- j += 1
- else:
- break
- entity = ''.join(text_raw[tok_to_orig_start_index[i]: tok_to_orig_end_index[i + j] + 1])
- entity_list.append(entity)
-
- # deduplicate the list of all candidate entity mentions
- return list(set(entity_list))
-
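- # Toy illustration of the tagging scheme assumed by find_entity (values invented): with
- # rel_num relations, label id_ marks the first token of an entity for that predicate and
- # label id_ + rel_num marks its continuation tokens. For id_ = 3, rel_num = 10 and
- # per-token predictions [[[3]], [[13]], [[13]], []], the first three tokens form one
- # mention, and the corresponding span of text_raw is joined into a single entity string.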
-
- def decoding(example_batch, id2spo, subject_logits_batch, object_logits_batch, seq_len_batch, tok_to_orig_start_index_batch, tok_to_orig_end_index_batch):
- """
- convert model output logits into formatted SPO triples (same format as the dataset files)
- """
- formatted_outputs = []
- for (i, (example, subject_logits, object_logits, seq_len, tok_to_orig_start_index, tok_to_orig_end_index)) in enumerate(zip(example_batch, subject_logits_batch, object_logits_batch, seq_len_batch, tok_to_orig_start_index_batch, tok_to_orig_end_index_batch)):
-
- subject_logits = subject_logits[1:seq_len + 1] # slice between [CLS] and [SEP] to get valid logits
- subject_logits[subject_logits >= 0.5] = 1
- subject_logits[subject_logits < 0.5] = 0
-
- object_logits = object_logits[1:seq_len + 1] # slice between [CLS] and [SEP] to get valid logits
- object_logits[object_logits >= 0.5] = 1
- object_logits[object_logits < 0.5] = 0
-
- tok_to_orig_start_index = tok_to_orig_start_index[1:seq_len + 1]
- tok_to_orig_end_index = tok_to_orig_end_index[1:seq_len + 1]
-
- subject_predictions = []
- object_predictions = []
-
- # logits shape: [seq_len, predicate_num]
- # for each token, keep the indices where the logit equals 1 and use them to recover the labels
- for token in subject_logits:
- subject_predictions.append(np.argwhere(token == 1).tolist())
-
- for token in object_logits:
- object_predictions.append(np.argwhere(token == 1).tolist())
-
- # format predictions into example-style output
- formatted_instance = {}
- text_raw = example["text"]
-
- # flatten the kept indices, then retrieve all valid subject (head-entity) ids
- subject_flatten_predictions = []
- for layer_1 in subject_predictions:
- for layer_2 in layer_1:
- subject_flatten_predictions.append(layer_2[0])
-
- object_flatten_predictions = []
- for layer_1 in object_predictions:
- for layer_2 in layer_1:
- object_flatten_predictions.append(layer_2[0])
-
- id_list = []
- for cls_label in list(set(subject_flatten_predictions) & set(object_flatten_predictions)):
- # "+relation_num" means: there is a matching object (tail-entity) label
- if 0 < cls_label < len(id2spo["predicate"]) - 1:
- id_list.append(cls_label)
- id_list = list(set(id_list))
-
- # fetch all valid spo by subject id
- spo_list = []
- for id_ in id_list:
- subjects = find_entity(text_raw, id_,
- subject_predictions,
- tok_to_orig_start_index,
- tok_to_orig_end_index,
- len(id2spo["predicate"]) - 2)
- objects = find_entity(text_raw, id_,
- object_predictions,
- tok_to_orig_start_index,
- tok_to_orig_end_index,
- len(id2spo["predicate"]) - 2)
-
- # pair every subject with every object of the same predicate (no sentence-level matching)
- for subject_ in subjects:
- for object_ in objects:
- # the predicate id was shifted down by 1 when building the dataloader, so add 1 back here
- spo_list.append({
- "predicate": id2spo['predicate'][id_ + 1],
- "object_type": id2spo['object_type'][id_ + 1],
- 'subject_type': id2spo['subject_type'][id_ + 1],
- "object": object_,
- "subject": subject_
- })
-
- formatted_instance['text'] = example['text']
- formatted_instance['spo_list'] = spo_list
- formatted_outputs.append(formatted_instance)
- return formatted_outputs
-
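- # Sketch of one decoded record (field values are placeholders, not real output): each
- # element of formatted_outputs mirrors the dataset format, e.g.
- #   {"text": "...",
- #    "spo_list": [{"predicate": "<predicate>", "object_type": "<type>",
- #                  "subject_type": "<type>", "object": "<mention>", "subject": "<mention>"}]}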
-
- def drop_spo_duplicates(spo_list):
- '''
- deduplicate an annotated spo_list
- '''
- alist = pd.DataFrame(spo_list).drop_duplicates(subset=None, keep='first', inplace=False)
- data_array = np.array(alist)
- # then convert back to a plain Python list
- data_list = data_array.tolist()
- return data_list
-
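- # Illustrative example (values invented): duplicates are dropped while order is kept, e.g.
- #   drop_spo_duplicates([["A", "director", "B"], ["A", "director", "B"], ["A", "writer", "C"]])
- #   -> [["A", "director", "B"], ["A", "writer", "C"]]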
-
- def write_content(formatted_outputs, file_path):
- with codecs.open(file_path, 'w', 'utf-8') as f:
- for formatted_instance in formatted_outputs:
- json_str = json.dumps(formatted_instance, ensure_ascii=False)
- f.write(json_str)
- f.write('\n')
-
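- # write_content() emits one JSON object per line (JSON Lines); e.g. a hypothetical call
- # write_content(formatted_outputs, "duie.json") writes lines such as
- #   {"text": "...", "spo_list": [...]}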
-
- def write_compare_result(example_all, pred_outputs, compare_file):
- '''
- compare the model predictions with the gold annotations and merge them into a single JSON file.
- example_all is the original dev set and pred_outputs is the model's predictions on it;
- compare_file is the path the merged comparison result is written to.
- '''
-
- dev_init = example_all
- dev_pred = pred_outputs
- results = []
-
- for i in range(len(dev_pred)):
- text = dev_init[i]['text']
- spo_list = []
- spo_list_pred = []
- for spo in dev_init[i]['spo_list']:
- spo_list.append([spo['subject'], spo['predicate'], spo['object']])
- for spo1 in dev_pred[i]['spo_list']:
- spo_list_pred.append([spo1['subject'], spo1['predicate'], spo1['object']])
-
- # if there are too many predictions, the model is doing poorly here; keep only the first 100
- if len(spo_list_pred) > 100:
- spo_list_pred = spo_list_pred[:100]
-
- new = [x for x in spo_list_pred if x not in spo_list]
- lack = [x for x in spo_list if x not in spo_list_pred]
-
- results.append({
- "text": text,
- "spo_list": spo_list,
- "spo_list_pred": spo_list_pred,
- "new": new,
- "lack": lack
- })
-
- with open(compare_file, 'w', encoding='utf-8') as f:
- json.dump(results, f, indent=4, ensure_ascii=False)
-
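- # Sketch of one record stored in compare_file (field values invented for illustration):
- #   {"text": "...",
- #    "spo_list": [["subj", "pred", "obj"], ...],       # gold triples
- #    "spo_list_pred": [["subj", "pred", "obj"], ...],  # predicted triples (at most 100 kept)
- #    "new": [...],    # predicted but not in the gold list
- #    "lack": [...]}   # gold but missing from the predictions
-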
- # previous version of get_precision_recall_f1_v1, kept commented out for reference:
- # def get_precision_recall_f1_v1(compare_file, match_pattern):
- # '''
- # rewritten version of the scoring function used by the original Paddle baseline; strict matching
- # '''
- # p_count = 0
- # r_count = 0
- # p_sum = 0
- # r_sum = 0
- #
- # with open(compare_file, 'r', encoding="utf-8") as file:
- # text = file.read()
- # text = json.loads(text)
- # df = pd.json_normalize(text)
- #
- # for index, row in df.iterrows():
- # spo_list = row['spo_list']
- # # deduplicate the gold spo annotations
- # spo_list = drop_spo_duplicates(spo_list)
- # spo_list_pred = row['spo_list_pred']
- # p_sum += len(spo_list_pred)
- # r_sum += len(spo_list)
- # '''
- # recall is counted as the number of matches against the gold triples
- # '''
- # for spo in spo_list:
- # for spo_pred in spo_list_pred:
- # if spo[1] == spo_pred[1]:
- # if match_pattern == "whole_text":
- # spo_pred_s = spo_pred[0]
- # spo_pred_o = spo_pred[2]
- # spo_s = spo[0]
- # spo_o = spo[2]
- # elif match_pattern == "only_head_text":
- # spo_pred_s = spo_pred[0].split(" ")[0]
- # spo_pred_o = spo_pred[2].split(" ")[0]
- # spo_s = spo[0].split(" ")[0]
- # spo_o = spo[2].split(" ")[0]
- # else:
- # raise ValueError("match_pattern error: not in ['whole_text', 'only_head_text']")
- #
- # # exact match
- # if (spo_pred_s == spo_s) and (spo_pred_o == spo_o):
- # r_count += 1
- #
- # '''
- # precision is counted as the number of matched predictions
- # '''
- # for spo_pred in spo_list_pred:
- # flag = 0
- # for spo in spo_list:
- # if spo[1] == spo_pred[1]:
- # if match_pattern == "whole_text":
- # spo_pred_s = spo_pred[0]
- # spo_pred_o = spo_pred[2]
- # spo_s = spo[0]
- # spo_o = spo[2]
- # elif match_pattern == "only_head_text":
- # spo_pred_s = spo_pred[0].split(" ")[0]
- # spo_pred_o = spo_pred[2].split(" ")[0]
- # spo_s = spo[0].split(" ")[0]
- # spo_o = spo[2].split(" ")[0]
- # else:
- # raise ValueError("match_pattern error: not in ['whole_text', 'only_head_text']")
- #
- # # exact match
- # if (spo_pred_s == spo_s) and (spo_pred_o == spo_o) and flag == 0:
- # flag = 1
- # p_count += 1
- #
- # # compute the metrics
- # precision = p_count / p_sum if p_sum > 0 else 0.0
- # recall = r_count / r_sum if r_sum > 0 else 0.0
- # f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
- # return precision, recall, f1
- #
- #
-
- def get_precision_recall_f1_v1(compare_file, match_pattern):
- '''
- rewritten version of the scoring function used by the original Paddle baseline; the score is computed with strict (exact) matching
- '''
- correct_num, pred_num, gold_num = 0, 0, 0
-
- with open(compare_file, 'r', encoding="utf-8") as file:
- text = file.read()
- text = json.loads(text)
- df = pd.json_normalize(text)
-
- for index, row in df.iterrows():
- spo_list = row['spo_list']
- spo_list = drop_spo_duplicates(spo_list)
- spo_list_pred = row['spo_list_pred']
-
- if match_pattern == "only_head_text":
- spo_list = [[spo[0].split(" ")[0], spo[1], spo[2].split(" ")[0]] for spo in spo_list]
- spo_list_pred = [[spo[0].split(" ")[0], spo[1], spo[2].split(" ")[0]] for spo in spo_list_pred]
- elif match_pattern != "whole_text":
- raise ValueError("match_pattern error: not in ['whole_text', 'only_head_text']")
-
- for rel_str in spo_list_pred:
- if rel_str in spo_list:
- correct_num += 1
-
- pred_num += len(spo_list_pred)
- gold_num += len(spo_list)
-
- minimini = 1e-10  # small epsilon to avoid division by zero
- precision = correct_num / (pred_num + minimini)
- recall = correct_num / (gold_num + minimini)
- f1 = 2 * precision * recall / (precision + recall + minimini)
-
- return precision, recall, f1
-
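- # Worked example (numbers invented): with 8 correct triples, 10 predicted and 16 gold,
- # precision = 8 / 10 = 0.8, recall = 8 / 16 = 0.5 and
- # f1 = 2 * 0.8 * 0.5 / (0.8 + 0.5) ≈ 0.615; the 1e-10 epsilon only guards against
- # division by zero and barely changes these values.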
-
- def get_model_size(model, framework):
- if framework == "paddlepaddle":
- import numpy as np
-
- # counter for the total number of parameters
- Total_params = 0
-
- # iterate over the model parameters
- for p in model.parameters():
- mulValue = np.prod(p.shape)  # np.prod gives the number of elements in this parameter tensor
- Total_params += mulValue  # accumulate the total parameter count
-
- return f'Total params: {Total_params / 1024 / 1024}'  # raw parameter count divided by 1024 * 1024 (Mi)
- elif framework == "pytorch":
- param_size = 0
- param_sum = 0
- for param in model.parameters():
- param_size += param.nelement() * param.element_size()
- param_sum += param.nelement()
- buffer_size = 0
- buffer_sum = 0
- for buffer in model.buffers():
- buffer_size += buffer.nelement() * buffer.element_size()
- buffer_sum += buffer.nelement()
- # all_size = (param_size + buffer_size) / 1024 / 1024
- # print((param_size, param_sum , buffer_size, buffer_sum, all_size))
- return f'Total params: {param_sum / 1024 / 1024}'  # raw parameter count divided by 1024 * 1024 (Mi)
- elif framework == "tensorflow":
- import numpy as np
- import tensorflow as tf
-
- # note: tf.trainable_variables() is the TF1.x graph-mode API; the `model` argument is not used here
- para_num = sum([np.prod(var.get_shape().as_list()) for var in tf.trainable_variables()])
- # para_size: number of parameters * 4 bytes each (float32) / 1024 / 1024, i.e. size in MB
- para_size = para_num * 4 / 1024 / 1024
- return f'Total params: {para_size}'
- elif framework == "keras":
- import numpy as np
-
- para_num = sum([np.prod(w.shape) for w in model.get_weights()])
- # para_size: number of parameters * 4 bytes each (float32) / 1024 / 1024, i.e. size in MB
- para_size = para_num * 4 / 1024 / 1024
- return f'Total params: {para_size}'
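-
-
- # Usage sketch (hedged; `model` is any already-built model object): the paddlepaddle and
- # pytorch branches report the raw parameter count divided by 1024 * 1024, while the
- # tensorflow and keras branches report an estimated float32 size in MB.
- #   print(get_model_size(model, framework="paddlepaddle"))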