- # Copyright (c) 2021 Baidu.com, Inc. All Rights Reserved
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import paddle
- import paddle.nn as nn
- import codecs
- import json
- import numpy as np
- import pandas as pd
-
-
- class BCELossForDuIE(nn.Layer):
- def __init__(self):
- super(BCELossForDuIE, self).__init__()
- self.criterion = nn.BCEWithLogitsLoss(reduction='none')
-
- def forward(self, logits, labels, mask):
- loss = self.criterion(logits, labels)
- mask = paddle.cast(mask, 'float32')
- loss = loss * mask.unsqueeze(-1)  # zero out the loss at padded positions
- loss = paddle.sum(loss.mean(axis=2), axis=1) / paddle.sum(mask, axis=1)  # per-sequence mean over valid tokens
- loss = loss.mean()
- return loss
-
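- # Minimal usage sketch (illustrative only; the shapes below are assumptions, not taken
- # from the training script): `logits` and `labels` are float tensors of shape
- # [batch_size, seq_len, num_labels], and `mask` marks the non-padding tokens.
- #   criterion = BCELossForDuIE()
- #   logits = paddle.rand([2, 8, 5])
- #   labels = paddle.randint(0, 2, [2, 8, 5]).astype('float32')
- #   mask = paddle.to_tensor([[1] * 8, [1] * 6 + [0] * 2])
- #   loss = criterion(logits, labels, mask)  # scalar: mean of the per-sequence masked losses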
-
- def get_optimizer(model, lr, weight_decay):
- optimizer = paddle.optimizer.AdamW(
- learning_rate=lr,
- parameters=model.parameters(),
- weight_decay=weight_decay,
- apply_decay_param_fun=lambda x: x in [
- p.name for n, p in model.named_parameters()
- if not any(nd in n for nd in ["bias", "norm"])
- ]
- )
-
- return optimizer
-
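- # Usage sketch (hedged; `model`, the learning rate and the decay value are placeholders):
- # parameters whose names contain "bias" or "norm" are excluded from weight decay through
- # apply_decay_param_fun.
- #   optimizer = get_optimizer(model, lr=2e-5, weight_decay=0.01)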
-
- def find_entity(text_raw, id_, predictions, tok_to_orig_start_index, tok_to_orig_end_index, rel_num):
- """
- retrieve the entity mentions labelled with the given predicate id from the per-token predictions.
- this is called by the `decoding` function.
- """
- entity_list = []
- for i in range(len(predictions)):
- if [id_] in predictions[i]:
- j = 0  # search forward from this token; the label at index id_ + rel_num is the "I" (inside) tag
- while i + j + 1 < len(predictions):
- if [id_ + rel_num] in predictions[i + j + 1]:
- j += 1
- else:
- break
- entity = ''.join(text_raw[tok_to_orig_start_index[i]: tok_to_orig_end_index[i + j] + 1])
- entity_list.append(entity)
-
- # deduplicate the list of all candidate entity mentions
- return list(set(entity_list))
-
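- # Toy illustration of the tagging scheme assumed by find_entity (values invented): with
- # rel_num relations, label id_ marks the first token of an entity for that predicate and
- # label id_ + rel_num marks its continuation tokens. For id_ = 3, rel_num = 10 and
- # per-token predictions [[[3]], [[13]], [[13]], []], the first three tokens form one
- # mention, and the corresponding span of text_raw is joined into a single entity string.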
-
- def decoding(example_batch, id2spo, subject_logits_batch, object_logits_batch, seq_len_batch, tok_to_orig_start_index_batch, tok_to_orig_end_index_batch):
- """
- convert model output logits into formatted SPO triples (same format as the dataset files)
- """
- formatted_outputs = []
- for (i, (example, subject_logits, object_logits, seq_len, tok_to_orig_start_index, tok_to_orig_end_index)) in enumerate(zip(example_batch, subject_logits_batch, object_logits_batch, seq_len_batch, tok_to_orig_start_index_batch, tok_to_orig_end_index_batch)):
-
- subject_logits = subject_logits[1:seq_len + 1] # slice between [CLS] and [SEP] to get valid logits
- subject_logits[subject_logits >= 0.5] = 1
- subject_logits[subject_logits < 0.5] = 0
-
- object_logits = object_logits[1:seq_len + 1] # slice between [CLS] and [SEP] to get valid logits
- object_logits[object_logits >= 0.5] = 1
- object_logits[object_logits < 0.5] = 0
-
- tok_to_orig_start_index = tok_to_orig_start_index[1:seq_len + 1]
- tok_to_orig_end_index = tok_to_orig_end_index[1:seq_len + 1]
-
- subject_predictions = []
- object_predictions = []
-
- # logits shape: [seq_len, predicate_num]
- # for each token, keep the indices where the logit equals 1 and use them to recover the labels
- for token in subject_logits:
- subject_predictions.append(np.argwhere(token == 1).tolist())
-
- for token in object_logits:
- object_predictions.append(np.argwhere(token == 1).tolist())
-
- # format predictions into example-style output
- formatted_instance = {}
- text_raw = example["text"]
-
- # flatten the kept indices, then retrieve all valid subject (head-entity) ids
- subject_flatten_predictions = []
- for layer_1 in subject_predictions:
- for layer_2 in layer_1:
- subject_flatten_predictions.append(layer_2[0])
-
- object_flatten_predictions = []
- for layer_1 in object_predictions:
- for layer_2 in layer_1:
- object_flatten_predictions.append(layer_2[0])
-
- id_list = []
- for cls_label in list(set(subject_flatten_predictions) & set(object_flatten_predictions)):
- # "+relation_num" means: there is a matching object (tail-entity) label
- if 0 < cls_label < len(id2spo["predicate"]) - 1:
- id_list.append(cls_label)
- id_list = list(set(id_list))
-
- # fetch all valid spo by subject id
- spo_list = []
- for id_ in id_list:
- subjects = find_entity(text_raw, id_,
- subject_predictions,
- tok_to_orig_start_index,
- tok_to_orig_end_index,
- len(id2spo["predicate"]) - 2)
- objects = find_entity(text_raw, id_,
- object_predictions,
- tok_to_orig_start_index,
- tok_to_orig_end_index,
- len(id2spo["predicate"]) - 2)
-
- # pair every subject with every object of the same predicate (no sentence-level matching)
- for subject_ in subjects:
- for object_ in objects:
- # the predicate id was shifted down by 1 when building the dataloader, so add 1 back here
- spo_list.append({
- "predicate": id2spo['predicate'][id_ + 1],
- "object_type": id2spo['object_type'][id_ + 1],
- 'subject_type': id2spo['subject_type'][id_ + 1],
- "object": object_,
- "subject": subject_
- })
-
- formatted_instance['text'] = example['text']
- formatted_instance['spo_list'] = spo_list
- formatted_outputs.append(formatted_instance)
- return formatted_outputs
-
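- # Sketch of one decoded record (field values are placeholders, not real output): each
- # element of formatted_outputs mirrors the dataset format, e.g.
- #   {"text": "...",
- #    "spo_list": [{"predicate": "<predicate>", "object_type": "<type>",
- #                  "subject_type": "<type>", "object": "<mention>", "subject": "<mention>"}]}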
-
- def drop_spo_duplicates(spo_list):
- '''
- deduplicate an annotated spo_list
- '''
- alist = pd.DataFrame(spo_list).drop_duplicates(subset=None, keep='first', inplace=False)
- data_array = np.array(alist)
- # then convert back to a plain Python list
- data_list = data_array.tolist()
- return data_list
-
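- # Illustrative example (values invented): duplicates are dropped while order is kept, e.g.
- #   drop_spo_duplicates([["A", "director", "B"], ["A", "director", "B"], ["A", "writer", "C"]])
- #   -> [["A", "director", "B"], ["A", "writer", "C"]]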
-
- def write_content(formatted_outputs, file_path):
- with codecs.open(file_path, 'w', 'utf-8') as f:
- for formatted_instance in formatted_outputs:
- json_str = json.dumps(formatted_instance, ensure_ascii=False)
- f.write(json_str)
- f.write('\n')
-
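- # write_content() emits one JSON object per line (JSON Lines); e.g. a hypothetical call
- # write_content(formatted_outputs, "duie.json") writes lines such as
- #   {"text": "...", "spo_list": [...]}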
-
- def write_compare_result(example_all, pred_outputs, compare_file):
- '''
- compare the model predictions with the gold annotations and merge them into a single JSON file.
- example_all is the original dev set and pred_outputs is the model's predictions on it;
- compare_file is the path the merged comparison result is written to.
- '''
-
- dev_init = example_all
- dev_pred = pred_outputs
- results = []
-
- for i in range(len(dev_pred)):
- text = dev_init[i]['text']
- spo_list = []
- spo_list_pred = []
- for spo in dev_init[i]['spo_list']:
- spo_list.append([spo['subject'], spo['predicate'], spo['object']])
- for spo1 in dev_pred[i]['spo_list']:
- spo_list_pred.append([spo1['subject'], spo1['predicate'], spo1['object']])
-
- # if there are too many predictions, the model is doing poorly here; keep only the first 100
- if len(spo_list_pred) > 100:
- spo_list_pred = spo_list_pred[:100]
-
- new = [x for x in spo_list_pred if x not in spo_list]
- lack = [x for x in spo_list if x not in spo_list_pred]
-
- results.append({
- "text": text,
- "spo_list": spo_list,
- "spo_list_pred": spo_list_pred,
- "new": new,
- "lack": lack
- })
-
- with open(compare_file, 'w', encoding='utf-8') as f:
- json.dump(results, f, indent=4, ensure_ascii=False)
-
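- # Sketch of one record stored in compare_file (field values invented for illustration):
- #   {"text": "...",
- #    "spo_list": [["subj", "pred", "obj"], ...],       # gold triples
- #    "spo_list_pred": [["subj", "pred", "obj"], ...],  # predicted triples (at most 100 kept)
- #    "new": [...],    # predicted but not in the gold list
- #    "lack": [...]}   # gold but missing from the predictions
-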
- # previous version of get_precision_recall_f1_v1, kept commented out for reference:
- # def get_precision_recall_f1_v1(compare_file, match_pattern):
- # '''
- # rewritten version of the scoring function used by the original Paddle baseline; strict matching
- # '''
- # p_count = 0
- # r_count = 0
- # p_sum = 0
- # r_sum = 0
- #
- # with open(compare_file, 'r', encoding="utf-8") as file:
- # text = file.read()
- # text = json.loads(text)
- # df = pd.json_normalize(text)
- #
- # for index, row in df.iterrows():
- # spo_list = row['spo_list']
- # # deduplicate the gold spo annotations
- # spo_list = drop_spo_duplicates(spo_list)
- # spo_list_pred = row['spo_list_pred']
- # p_sum += len(spo_list_pred)
- # r_sum += len(spo_list)
- # '''
- # recall is counted as the number of matches against the gold triples
- # '''
- # for spo in spo_list:
- # for spo_pred in spo_list_pred:
- # if spo[1] == spo_pred[1]:
- # if match_pattern == "whole_text":
- # spo_pred_s = spo_pred[0]
- # spo_pred_o = spo_pred[2]
- # spo_s = spo[0]
- # spo_o = spo[2]
- # elif match_pattern == "only_head_text":
- # spo_pred_s = spo_pred[0].split(" ")[0]
- # spo_pred_o = spo_pred[2].split(" ")[0]
- # spo_s = spo[0].split(" ")[0]
- # spo_o = spo[2].split(" ")[0]
- # else:
- # raise ValueError("match_pattern error: not in ['whole_text', 'only_head_text']")
- #
- # # exact match
- # if (spo_pred_s == spo_s) and (spo_pred_o == spo_o):
- # r_count += 1
- #
- # '''
- # precision is counted as the number of matched predictions
- # '''
- # for spo_pred in spo_list_pred:
- # flag = 0
- # for spo in spo_list:
- # if spo[1] == spo_pred[1]:
- # if match_pattern == "whole_text":
- # spo_pred_s = spo_pred[0]
- # spo_pred_o = spo_pred[2]
- # spo_s = spo[0]
- # spo_o = spo[2]
- # elif match_pattern == "only_head_text":
- # spo_pred_s = spo_pred[0].split(" ")[0]
- # spo_pred_o = spo_pred[2].split(" ")[0]
- # spo_s = spo[0].split(" ")[0]
- # spo_o = spo[2].split(" ")[0]
- # else:
- # raise ValueError("match_pattern error: not in ['whole_text', 'only_head_text']")
- #
- # # exact match
- # if (spo_pred_s == spo_s) and (spo_pred_o == spo_o) and flag == 0:
- # flag = 1
- # p_count += 1
- #
- # # compute the metrics
- # precision = p_count / p_sum if p_sum > 0 else 0.0
- # recall = r_count / r_sum if r_sum > 0 else 0.0
- # f1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
- # return precision, recall, f1
- #
- #
-
- def get_precision_recall_f1_v1(compare_file, match_pattern):
- '''
- rewritten version of the scoring function used by the original Paddle baseline; the score is computed with strict (exact) matching
- '''
- correct_num, pred_num, gold_num = 0, 0, 0
-
- with open(compare_file, 'r', encoding="utf-8") as file:
- text = file.read()
- text = json.loads(text)
- df = pd.json_normalize(text)
-
- for index, row in df.iterrows():
- spo_list = row['spo_list']
- spo_list = drop_spo_duplicates(spo_list)
- spo_list_pred = row['spo_list_pred']
-
- if match_pattern == "only_head_text":
- spo_list = [[spo[0].split(" ")[0], spo[1], spo[2].split(" ")[0]] for spo in spo_list]
- spo_list_pred = [[spo[0].split(" ")[0], spo[1], spo[2].split(" ")[0]] for spo in spo_list_pred]
- elif match_pattern != "whole_text":
- raise ValueError("match_pattern error: not in ['whole_text', 'only_head_text']")
-
- for rel_str in spo_list_pred:
- if rel_str in spo_list:
- correct_num += 1
-
- pred_num += len(spo_list_pred)
- gold_num += len(spo_list)
-
- minimini = 1e-10  # small epsilon to avoid division by zero
- precision = correct_num / (pred_num + minimini)
- recall = correct_num / (gold_num + minimini)
- f1 = 2 * precision * recall / (precision + recall + minimini)
-
- return precision, recall, f1
-
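- # Worked example (numbers invented): with 8 correct triples, 10 predicted and 16 gold,
- # precision = 8 / 10 = 0.8, recall = 8 / 16 = 0.5 and
- # f1 = 2 * 0.8 * 0.5 / (0.8 + 0.5) ≈ 0.615; the 1e-10 epsilon only guards against
- # division by zero and barely changes these values.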
-
- def get_model_size(model, framework):
- if framework == "paddlepaddle":
- import numpy as np
-
- # counter for the total number of parameters
- Total_params = 0
-
- # iterate over the model parameters
- for p in model.parameters():
- mulValue = np.prod(p.shape)  # np.prod gives the number of elements in this parameter tensor
- Total_params += mulValue  # accumulate the total parameter count
-
- return f'Total params: {Total_params / 1024 / 1024}'  # raw parameter count divided by 1024 * 1024 (Mi)
- elif framework == "pytorch":
- param_size = 0
- param_sum = 0
- for param in model.parameters():
- param_size += param.nelement() * param.element_size()
- param_sum += param.nelement()
- buffer_size = 0
- buffer_sum = 0
- for buffer in model.buffers():
- buffer_size += buffer.nelement() * buffer.element_size()
- buffer_sum += buffer.nelement()
- # all_size = (param_size + buffer_size) / 1024 / 1024
- # print((param_size, param_sum , buffer_size, buffer_sum, all_size))
- return f'Total params: {param_sum / 1024 / 1024}'  # raw parameter count divided by 1024 * 1024 (Mi)
- elif framework == "tensorflow":
- import numpy as np
- import tensorflow as tf
-
- # note: tf.trainable_variables() is the TF1.x graph-mode API; the `model` argument is not used here
- para_num = sum([np.prod(var.get_shape().as_list()) for var in tf.trainable_variables()])
- # para_size: number of parameters * 4 bytes each (float32) / 1024 / 1024, i.e. size in MB
- para_size = para_num * 4 / 1024 / 1024
- return f'Total params: {para_size}'
- elif framework == "keras":
- import numpy as np
-
- para_num = sum([np.prod(w.shape) for w in model.get_weights()])
- # para_size: number of parameters * 4 bytes each (float32) / 1024 / 1024, i.e. size in MB
- para_size = para_num * 4 / 1024 / 1024
- return f'Total params: {para_size}'
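-
-
- # Usage sketch (hedged; `model` is any already-built model object): the paddlepaddle and
- # pytorch branches report the raw parameter count divided by 1024 * 1024, while the
- # tensorflow and keras branches report an estimated float32 size in MB.
- #   print(get_model_size(model, framework="paddlepaddle"))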