|
- # coding=utf-8
- # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import argparse
- import copy
- import json
- import os
- import random
- import time
- from decimal import Decimal
-
- import numpy as np
- import paddle
- from utils import load_txt
-
- from paddlenlp.trainer.argparser import strtobool
- from paddlenlp.utils.log import logger
-
-
def set_seed(seed):
    """Seed every random source in use (numpy, stdlib random, paddle) so runs are reproducible."""
    np.random.seed(seed)
    random.seed(seed)
    paddle.seed(seed)
-
-
# Chinese prompt fragments used to build UIE prompts and classification options.
# NOTE: a runtime-only "options" key (the list of classification options) is
# added to this dict by do_convert() before Convertor reads it in __init__.
PROMPT_ITEMS = {
    "aspect_prompt_prefix": "评价维度",
    "opinion_prompt": "观点词",
    "sentiment_prompt_prefix": "情感倾向",
    "separator": "##",
    "not_mentioned_option": "未提及",
    "positive_option": "正向",
    "negative_option": "负向",
}
-
-
class Convertor(object):
    """Convert examples exported from the Label Studio annotation platform
    into UIE-style examples for aspect-based sentiment analysis.

    Every produced example is shaped as
    ``{"content": str, "result_list": [{"text", "start", "end"}, ...], "prompt": str}``.
    For extraction tasks three families are generated: entity extraction
    (prompt is an entity label such as "评价维度"), relation extraction
    (prompt is "X的Y") and sentiment classification (prompt is
    "X的情感倾向[正向,负向,...]"); negative examples are added up to
    ``negative_ratio`` per example.
    """

    def __init__(self, negative_ratio=5):
        """Init Data Convertor.

        Args:
            negative_ratio (int): Upper bound of negative examples generated
                per example for the extraction task.
        """
        self.negative_ratio = negative_ratio
        self.aspect_prompt_prefix = PROMPT_ITEMS["aspect_prompt_prefix"]
        self.opinion_prompt = PROMPT_ITEMS["opinion_prompt"]
        self.sentiment_prompt_prefix = PROMPT_ITEMS["sentiment_prompt_prefix"]
        self.separator = PROMPT_ITEMS["separator"]
        self.not_mentioned_option = PROMPT_ITEMS["not_mentioned_option"]
        # NOTE: the "options" key is injected into PROMPT_ITEMS at runtime by
        # do_convert() before this class is instantiated.
        self.options = PROMPT_ITEMS["options"]

    def process_text_tag(self, line, task_type="ext"):
        """Normalize one raw Label Studio record.

        For ``task_type="ext"`` returns the text plus entity spans, relations
        and the set of entity ids taking part in some relation; for
        ``task_type="cls"`` returns the text and its choice labels.
        """
        items = {}
        items["text"] = line["data"]["text"]
        if task_type == "ext":
            items["entities"] = []
            items["relations"] = []
            items["relation_ids"] = set()
            result_list = line["annotations"][0]["result"]
            for result in result_list:
                if result["type"] == "labels":
                    items["entities"].append(
                        {
                            "id": result["id"],
                            "start_offset": result["value"]["start"],
                            "end_offset": result["value"]["end"],
                            "label": result["value"]["labels"][0],
                        }
                    )
                else:
                    items["relations"].append(
                        {
                            "id": result["from_id"] + "-" + result["to_id"],
                            "from_id": result["from_id"],
                            "to_id": result["to_id"],
                            # an unlabeled relation defaults to the opinion prompt
                            "type": result["labels"][0] if result["labels"] else self.opinion_prompt,
                        }
                    )
                    items["relation_ids"].add(result["from_id"])
                    items["relation_ids"].add(result["to_id"])

        elif task_type == "cls":
            items["label"] = line["annotations"][0]["result"][0]["value"]["choices"]
        return items

    def convert_cls_examples(self, raw_examples, data_flag="Data"):
        """
        Convert labeled data for the sentence-level classification task.
        """
        examples = []
        logger.info("{0:7} Start to convert annotation data.".format("[" + data_flag + "]"))
        for line in raw_examples:
            items = self.process_text_tag(line, task_type="cls")
            text, labels = items["text"], items["label"]
            example = self.generate_cls_example(text, labels, self.sentiment_prompt_prefix, self.options)
            examples.append(example)
        logger.info("{0:7} End to convert annotation data.\n".format(""))
        return examples

    def convert_ext_examples(
        self,
        raw_examples,
        synonyms=None,
        implicit_opinion_map=None,
        sentiment_map=None,
        with_negatives=True,
        task_type="ext_aso",
        data_flag="Data",
    ):
        """
        Convert labeled data for the extraction task.

        Args:
            raw_examples: raw Label Studio records.
            synonyms: optional mapping aspect -> list of synonymous aspects.
            implicit_opinion_map: optional mapping opinion -> implicit aspect.
            sentiment_map: optional mapping opinion -> sentiment option.
            with_negatives (bool): whether to generate negative examples.
            task_type (str): one of ext_a / ext_o / ext_ao / ext_as / ext_aso.
            data_flag (str): tag used only in log messages.
        """

        def _sep_cls_label(label, separator):
            # "评价维度##正向" -> ("评价维度", ["正向"]); no separator -> (label, None)
            label_list = label.split(separator)
            if len(label_list) == 1:
                return label_list[0], None
            return label_list[0], label_list[1:]

        texts = []
        # {"content": "", "result_list": [], "prompt": "X"}
        entity_examples = []
        # {"content": "", "result_list": [], "prompt": "X的Y"}
        relation_examples = []
        # {"content": "", "result_list": [], "prompt": "X的情感倾向[正向,负向]"}
        entity_cls_examples = []

        # entity label set: ["评价维度", "观点词", ... ]
        entity_label_set = []
        # predicate set: ["观点词", ... ]
        predicate_set = []
        # set of subject entities seen in any relation: ["房间", "价格", ... ]
        subject_name_set = []

        # List of entity prompts for each example
        entity_prompt_list = []
        # Golden subject labels for each example
        subject_golden_list = []
        # List of inverse relations for each example
        inverse_relation_list = []
        # List of predicates for each example
        predicate_list = []

        logger.info("{0:7} Start to convert annotation data.".format("[" + data_flag + "]"))
        logger.info("{0:7} Trying to generate positive examples...".format(""))
        for line in raw_examples:
            items = self.process_text_tag(line, task_type="ext")

            text, relations, entities, relation_ids = (
                items["text"],
                items["relations"],
                items["entities"],
                items["relation_ids"],
            )
            texts.append(text)

            entity_example = []
            entity_prompt = []
            entity_example_map = {}
            implicit_example_map = {}
            entity_map = {}
            subject_golden = []
            for entity in entities:
                entity_name = text[entity["start_offset"] : entity["end_offset"]]
                entity_map[entity["id"]] = {
                    "name": entity_name,
                    "start": entity["start_offset"],
                    "end": entity["end_offset"],
                }

                entity_label, entity_cls_label = _sep_cls_label(entity["label"], self.separator)

                # generate examples for entity-level sentiment classification
                if entity_cls_label is not None:
                    entity_cls_prompt_prefix = entity_name + "的" + self.sentiment_prompt_prefix
                    entity_cls_example = self.generate_cls_example(
                        text, entity_cls_label, entity_cls_prompt_prefix, self.options
                    )

                    entity_cls_examples.append(entity_cls_example)

                # generate examples for entity extraction
                result = {"text": entity_name, "start": entity["start_offset"], "end": entity["end_offset"]}
                if entity_label not in entity_example_map:
                    entity_example_map[entity_label] = {
                        "content": text,
                        "result_list": [result],
                        "prompt": entity_label,
                    }
                else:
                    entity_example_map[entity_label]["result_list"].append(result)

                if entity_label not in entity_label_set:
                    entity_label_set.append(entity_label)
                entity_prompt.append(entity_label)

                # an entity taking part in no relation may carry an implicit
                # opinion: map it back to its (unmentioned) aspect
                if implicit_opinion_map and entity["id"] not in relation_ids:
                    mapped_entity = entity_map[entity["id"]]
                    if mapped_entity["name"] not in implicit_opinion_map:
                        continue

                    result = {
                        "text": mapped_entity["name"],
                        "start": mapped_entity["start"],
                        "end": mapped_entity["end"],
                    }
                    aspect = implicit_opinion_map[mapped_entity["name"]]
                    if aspect not in implicit_example_map:
                        implicit_example_map[aspect] = [result]
                    else:
                        implicit_example_map[aspect].append(result)

                if entity_label.startswith(self.aspect_prompt_prefix):
                    if entity_name not in subject_golden:
                        if synonyms and entity_name in synonyms:
                            # a synonym cluster includes the aspect itself
                            subject_synonyms = synonyms[entity_name]
                            subject_golden.extend(subject_synonyms)
                        else:
                            subject_golden.append(entity_name)

                    if entity_name not in subject_name_set:
                        subject_name_set.append(entity_name)

            for v in entity_example_map.values():
                entity_example.append(v)
            entity_examples.append(entity_example)
            entity_prompt_list.append(entity_prompt)

            # generate examples for classification of implicit opinion
            if task_type == "ext_as" or task_type == "ext_aso":
                for entity_name in implicit_example_map.keys():
                    prompt = entity_name + "的" + self.sentiment_prompt_prefix
                    opinions = implicit_example_map[entity_name]
                    # take the sentiment of the first opinion found in sentiment_map
                    sentiment = None
                    for opinion in opinions:
                        if opinion["text"] in sentiment_map:
                            sentiment = sentiment_map[opinion["text"]]
                            break
                    if sentiment is None:
                        continue
                    implicit_example = self.generate_cls_example(text, [sentiment], prompt, self.options)
                    entity_cls_examples.append(implicit_example)

            # generate examples for relation extraction
            # Golden entity inputs, initializing with implicit subjects and their synonyms
            for implicit_subject in implicit_example_map.keys():
                subject_golden.append(implicit_subject)
                if synonyms and implicit_subject in synonyms:
                    subject_golden.extend(synonyms[implicit_subject])
            relation_example = []
            relation_example_map = {}
            inverse_relation = []
            predicates = []

            # generate examples for extraction of implicit opinion
            for entity_name in implicit_example_map.keys():
                prompt = entity_name + "的" + self.opinion_prompt
                implicit_example = {
                    "content": text,
                    "result_list": implicit_example_map[entity_name],
                    "prompt": prompt,
                }
                relation_example.append(implicit_example)

            # generate examples for labeled relations
            for relation in relations:
                predicate = relation["type"]
                subject_id = relation["from_id"]
                object_id = relation["to_id"]

                prompt = entity_map[subject_id]["name"] + "的" + predicate
                # swapping subject/object yields a hard negative prompt
                inverse_negative = entity_map[object_id]["name"] + "的" + predicate

                result = {
                    "text": entity_map[object_id]["name"],
                    "start": entity_map[object_id]["start"],
                    "end": entity_map[object_id]["end"],
                }

                inverse_relation.append(inverse_negative)
                predicates.append(predicate)

                if prompt not in relation_example_map:
                    relation_example_map[prompt] = {"content": text, "result_list": [result], "prompt": prompt}
                else:
                    relation_example_map[prompt]["result_list"].append(result)

                if predicate not in predicate_set:
                    predicate_set.append(predicate)

            for v in relation_example_map.values():
                relation_example.append(v)

            relation_examples.append(relation_example)
            subject_golden_list.append(subject_golden)
            inverse_relation_list.append(inverse_relation)
            predicate_list.append(predicates)

        # start to generate negative examples
        if with_negatives and task_type in ["ext_as", "ext_ao", "ext_aso"]:
            logger.info("{0:7} Trying to generate negative examples...".format(""))

        # generate negative examples according to entity
        all_entity_examples = []
        if with_negatives:
            positive_examples, negative_examples = self.add_entity_negative_example(
                entity_examples, texts, entity_prompt_list, entity_label_set
            )
            if len(positive_examples) != 0:
                all_entity_examples = positive_examples + negative_examples
        else:
            for i in range(len(entity_examples)):
                all_entity_examples.extend(entity_examples[i])

        # generate negative examples according to relation
        all_relation_examples = []
        if with_negatives:
            if len(predicate_set) != 0:
                positive_examples = []
                negative_examples = []
                # each of the three redundant-prompt sources below gets an
                # equal share of the negative budget
                per_n_ratio = self.negative_ratio // 3

                for i, text in enumerate(texts):
                    negative_example = []
                    collects = []

                    # 1. inverse_relation_list
                    redundants1 = inverse_relation_list[i]

                    # 2. subject_name_set - subject_golden_list[i]
                    redundants2 = []
                    if len(predicate_list[i]) != 0:
                        nonentity_list = list(set(subject_name_set) - set(subject_golden_list[i]))
                        # sort for determinism: set order is not reproducible
                        nonentity_list.sort()

                        redundants2 = [
                            nonentity + "的" + predicate_list[i][random.randrange(len(predicate_list[i]))]
                            for nonentity in nonentity_list
                        ]

                    # 3. entity_label_set - entity_prompt_list[i]
                    redundants3 = []
                    if len(subject_golden_list[i]) != 0:
                        non_ent_label_list = list(set(entity_label_set) - set(entity_prompt_list[i]))
                        non_ent_label_list.sort()

                        redundants3 = [
                            subject_golden_list[i][random.randrange(len(subject_golden_list[i]))] + "的" + non_ent_label
                            for non_ent_label in non_ent_label_list
                        ]

                    redundants_list = [redundants1, redundants2, redundants3]

                    for redundants in redundants_list:
                        added, rest = self.add_relation_negative_example(redundants, texts[i], per_n_ratio)
                        negative_example.extend(added)
                        collects.extend(rest)

                    # top up with leftovers until the full negative budget is used
                    num_sup = self.negative_ratio - len(negative_example)
                    if num_sup > 0 and collects:
                        if num_sup > len(collects):
                            idxs = [k for k in range(len(collects))]
                        else:
                            idxs = random.sample(range(0, len(collects)), num_sup)
                        for idx in idxs:
                            negative_example.append(collects[idx])
                    positive_examples.extend(relation_examples[i])
                    negative_examples.extend(negative_example)

                all_relation_examples = positive_examples + negative_examples
        else:
            for i in range(len(relation_examples)):
                all_relation_examples.extend(relation_examples[i])

        # generate negative examples according to sentiment polarity
        all_cls_examples = entity_cls_examples
        if with_negatives:
            # NOTE: parenthesized fix — previously "a or b and c" made ext_aso
            # bypass the not-mentioned-option check because of operator
            # precedence, producing "未提及" labels outside the option list.
            if (task_type == "ext_aso" or task_type == "ext_as") and self.not_mentioned_option in self.options:
                cls_negatives_examples = self.add_cls_negative_example(texts, subject_name_set, subject_golden_list)
                all_cls_examples += cls_negatives_examples

        # generate examples with synonyms to support aspect aggregation
        if synonyms is not None:
            synonym_map = {}
            for k, vs in synonyms.items():
                for v in vs:
                    synonym_map[v] = k
            relation_synonym_examples = self.change_aspect_with_synonyms(all_relation_examples, synonyms, synonym_map)
            all_relation_examples += relation_synonym_examples
            cls_synonym_examples = self.change_aspect_with_synonyms(all_cls_examples, synonyms, synonym_map)
            all_cls_examples += cls_synonym_examples

        logger.info("{0:7} End to convert annotation data.\n".format(""))
        return all_entity_examples + all_relation_examples + all_cls_examples

    def change_aspect_with_synonyms(self, examples, synonyms, synonym_map):
        """Duplicate each example once per synonym of its prompt's aspect,
        rewriting only the prompt (content and result spans stay the same)."""
        synonym_examples = []
        for example in examples:
            prompt = example["prompt"]
            # partition (vs split+unpack) tolerates a prompt without "的"
            aspect, _, suffix = prompt.partition("的")
            if aspect not in synonym_map:
                continue
            synonym_cluster = synonyms[synonym_map[aspect]]
            for syn_aspect in synonym_cluster:
                if syn_aspect == aspect:
                    continue
                syn_prompt = syn_aspect + "的" + suffix
                syn_example = copy.deepcopy(example)
                syn_example["prompt"] = syn_prompt
                synonym_examples.append(syn_example)
        return synonym_examples

    def generate_cls_example(self, text, labels, prompt_prefix, options):
        """Build one classification example.

        The prompt is ``prompt_prefix`` plus the candidate options in a random
        order, e.g. ``"X的情感倾向[正向,负向]"``. Result offsets are negative,
        i.e. relative to the end of the prompt (the classification answer
        lives inside the prompt rather than the content); the extra -1
        presumably accounts for a separator token between content and prompt.
        """
        # Shuffle a copy of the given options so neither the caller's list
        # nor self.options is mutated as a side effect (the old code shuffled
        # self.options in place and ignored the `options` argument).
        cls_options = list(options)
        random.shuffle(cls_options)
        prompt = prompt_prefix + "[" + ",".join(cls_options) + "]"

        result_list = []
        example = {"content": text, "result_list": result_list, "prompt": prompt}

        for label in labels:
            start = prompt.rfind(label) - len(prompt) - 1
            end = start + len(label)
            result = {"text": label, "start": start, "end": end}
            example["result_list"].append(result)
        return example

    def add_entity_negative_example(self, examples, texts, prompts, label_set):
        """For each example, sample up to ``negative_ratio`` entity labels the
        example does not contain and emit empty-result negatives for them."""
        negative_examples = []
        positive_examples = []
        for i, prompt in enumerate(prompts):
            redundants = list(set(label_set) - set(prompt))
            # sort for determinism: set order is not reproducible
            redundants.sort()

            ratio = self.negative_ratio
            if ratio > len(redundants):
                ratio = len(redundants)
            idxs = random.sample(range(0, len(redundants)), ratio)

            for idx in idxs:
                negative_result = {"content": texts[i], "result_list": [], "prompt": redundants[idx]}
                negative_examples.append(negative_result)
            positive_examples.extend(examples[i])
        return positive_examples, negative_examples

    def add_relation_negative_example(self, redundants, text, ratio):
        """Sample ``ratio`` redundant prompts as empty-result negatives;
        return (sampled negatives, unsampled leftovers) so the caller can
        top up from the leftovers later."""
        added_example = []
        rest_example = []

        if ratio > len(redundants):
            ratio = len(redundants)

        all_idxs = [k for k in range(len(redundants))]
        idxs = random.sample(range(0, len(redundants)), ratio)
        rest_idxs = list(set(all_idxs) - set(idxs))

        for idx in idxs:
            negative_result = {"content": text, "result_list": [], "prompt": redundants[idx]}
            added_example.append(negative_result)

        for rest_idx in rest_idxs:
            negative_result = {"content": text, "result_list": [], "prompt": redundants[rest_idx]}
            rest_example.append(negative_result)

        return added_example, rest_example

    def add_cls_negative_example(self, texts, subject_name_set, subject_golden_list):
        """For each text, sample aspects it does NOT mention and emit
        "not mentioned" sentiment-classification negatives for them."""
        negative_examples = []
        for i, text in enumerate(texts):
            redundants = list(set(subject_name_set) - set(subject_golden_list[i]))
            # sort for determinism: set order is not reproducible
            redundants.sort()

            ratio = self.negative_ratio
            if ratio > len(redundants):
                ratio = len(redundants)
            idxs = random.sample(range(0, len(redundants)), ratio)

            for idx in idxs:
                subject_name = redundants[idx]
                prompt_prefix = subject_name + "的" + self.sentiment_prompt_prefix
                # use the configured not-mentioned option instead of a
                # hard-coded literal (same value, consistent with __init__)
                negative_example = self.generate_cls_example(
                    text, [self.not_mentioned_option], prompt_prefix, self.options
                )
                negative_examples.append(negative_example)
        return negative_examples
-
-
def load_synonym(synonym_path):
    """Load aspect synonym clusters.

    Each whitespace-separated line maps its first token (the head aspect)
    to the full token list, head included.
    """
    tokens_per_line = (text_line.split() for text_line in load_txt(synonym_path))
    return {tokens[0]: tokens for tokens in tokens_per_line}
-
-
def load_implicit_opinion(implicit_opinion_path):
    """Parse the implicit-opinion file.

    Each line looks like ``<aspect>, <sentiment>[op1 op2 ...], <sentiment>[...]``.
    Returns two dicts: opinion -> aspect and opinion -> sentiment.
    """
    opinion_to_aspect = {}
    opinion_to_sentiment = {}
    for raw_line in load_txt(implicit_opinion_path):
        fields = raw_line.split(",")
        aspect = fields[0].strip()
        for field in fields[1:]:
            field = field.strip()
            left = field.find("[")
            right = field.find("]")
            sentiment = field[:left]
            for opinion in field[left + 1 : right].strip().split():
                opinion_to_aspect[opinion] = aspect
                opinion_to_sentiment[opinion] = sentiment
    return opinion_to_aspect, opinion_to_sentiment
-
-
def parse_ext_task_type(raw_examples):
    """Inspect the annotations to determine the fine-grained extraction task.

    Returns one of "ext_a", "ext_o", "ext_ao", "ext_as" or "ext_aso":
    a = aspects only, o = opinions only, ao = aspect-opinion relations,
    as = aspect-level sentiment, aso = both relations and sentiment.

    Raises:
        ValueError: on an unknown annotation type or entity label.
    """

    task_type_dict = {"ext_a": False, "ext_o": False, "ext_ao": False, "ext_as": False, "ext_aso": False}

    def _parse_raw_example(raw_example):
        # Accumulate the capabilities demonstrated by one example into
        # task_type_dict (shared across all examples).
        entity_map = {}
        relations = []
        result_list = raw_example["annotations"][0]["result"]
        for result in result_list:
            if result["type"] == "labels":
                entity_id = result["id"]
                entity_map[entity_id] = result["value"]["labels"][0]
            elif result["type"] == "relation":
                relation_pair = (result["from_id"], result["to_id"])
                relations.append(relation_pair)
            else:
                raise ValueError(
                    "Unknown entity type [{}], it indicates that your dataset maybe not a aspect-based extraction dataset, please check it.".format(
                        result["type"]
                    )
                )

        for entity_label in entity_map.values():
            if (
                entity_label.startswith(PROMPT_ITEMS["aspect_prompt_prefix"])
                and PROMPT_ITEMS["separator"] in entity_label
            ):
                # e.g. "评价维度##正向": an aspect annotated with a sentiment
                task_type_dict["ext_as"] = True
            elif entity_label == PROMPT_ITEMS["aspect_prompt_prefix"]:
                task_type_dict["ext_a"] = True
            elif entity_label == PROMPT_ITEMS["opinion_prompt"]:
                task_type_dict["ext_o"] = True
            else:
                raise ValueError("Unknown prompt: {}".format(entity_label))

        # relations store the relation between aspect and opinion by default
        if relations:
            task_type_dict["ext_ao"] = True

        if task_type_dict["ext_ao"] and task_type_dict["ext_as"]:
            task_type_dict["ext_aso"] = True

    for raw_example in raw_examples:
        # analyze task type
        _parse_raw_example(raw_example)
        # "ext_aso" is the most specific verdict — safe to stop early.
        if task_type_dict["ext_aso"]:
            return "ext_aso"

    # Decide only after every example has been seen: the previous version
    # returned "ext_ao" as soon as one example had relations but no sentiment,
    # misclassifying datasets whose later examples carry sentiment labels.
    if task_type_dict["ext_ao"]:
        return "ext_ao"
    elif task_type_dict["ext_as"]:
        return "ext_as"
    elif task_type_dict["ext_o"]:
        return "ext_o"
    else:
        return "ext_a"
-
-
def do_convert():
    """Convert a Label Studio export into UIE train/dev/test files.

    All configuration comes from the module-level ``args`` namespace parsed
    in ``__main__``. Writes train.json / dev.json / test.json to
    ``args.save_dir``, one JSON example per line.

    Raises:
        ValueError: on a missing input file or an invalid ``args.splits``.
    """
    set_seed(args.seed)

    tic_time = time.time()
    if not os.path.exists(args.label_studio_file):
        raise ValueError("Please input the correct path of label studio file.")

    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    if len(args.splits) != 0 and len(args.splits) != 3:
        raise ValueError("Only []/ len(splits)==3 accepted for splits.")

    def _check_sum(splits):
        # Decimal avoids float-rounding artifacts (e.g. 0.8 + 0.1 + 0.1 != 1.0).
        return Decimal(str(splits[0])) + Decimal(str(splits[1])) + Decimal(str(splits[2])) == Decimal("1")

    if len(args.splits) == 3 and not _check_sum(args.splits):
        raise ValueError("Please set correct splits, sum of elements in splits should be equal to 1.")

    with open(args.label_studio_file, "r", encoding="utf-8") as f:
        raw_examples = json.load(f)

    if args.is_shuffle:
        indexes = np.random.permutation(len(raw_examples))
        raw_examples = [raw_examples[i] for i in indexes]

    # construct the classification options; they are shared with Convertor
    # through the module-level PROMPT_ITEMS dict
    if args.options:
        PROMPT_ITEMS["options"] = args.options
    else:
        if args.task_type == "ext":
            PROMPT_ITEMS["options"] = [
                PROMPT_ITEMS["positive_option"],
                PROMPT_ITEMS["negative_option"],
                PROMPT_ITEMS["not_mentioned_option"],
            ]
        else:
            PROMPT_ITEMS["options"] = [PROMPT_ITEMS["positive_option"], PROMPT_ITEMS["negative_option"]]

    # analyze detailed ext task type: ext_a, ext_o, ext_as, ext_ao, ext_aso
    if args.task_type == "ext":
        args.task_type = parse_ext_task_type(raw_examples)

    logger.info("You are trying perform dataset construction operation for task {}.\n".format(args.task_type))

    # load synonyms (only meaningful for tasks that aggregate aspects)
    synonyms = None
    if args.synonym_file:
        if args.task_type in ["cls", "ext_a", "ext_o"]:
            logger.warning(
                "The param synonym_file will not work for task, because the task {} that you wanna try does not support synonym_function.".format(
                    args.task_type
                )
            )
        else:
            if not os.path.isfile(args.synonym_file):
                raise ValueError(
                    "The path you input is not a file, please input the correct path of synonym file: {}".format(
                        args.synonym_file
                    )
                )
            synonyms = load_synonym(args.synonym_file)

    # load implicit opinions
    implicit_opinion_map = None
    sentiment_map = None
    if args.implicit_file:
        if args.task_type in ["cls", "ext_a", "ext_o", "ext_as"]:
            logger.warning(
                "The param implicit_file will not work for task, because the task {} that you wanna try does not support implicit opinion function.".format(
                    args.task_type
                )
            )
        else:
            if not os.path.isfile(args.implicit_file):
                raise ValueError(
                    "The path you input is not a file, please input the correct path of implicit opinion file: {}".format(
                        args.implicit_file
                    )
                )
            implicit_opinion_map, sentiment_map = load_implicit_opinion(args.implicit_file)

    # split examples into train/dev/test examples; an empty splits list is
    # accepted by the validation above and means "no splitting": every
    # example goes to the train set (the old code crashed on [] when
    # unpacking args.splits).
    if args.splits:
        i1, i2, _ = args.splits
    else:
        i1, i2 = 1.0, 0.0
    p1 = int(len(raw_examples) * i1)
    p2 = int(len(raw_examples) * (i1 + i2))

    # define Convertor and convert raw examples to model examples
    convertor = Convertor(negative_ratio=args.negative_ratio)

    if args.task_type.startswith("ext"):
        train_examples = convertor.convert_ext_examples(
            raw_examples[:p1],
            synonyms=synonyms,
            implicit_opinion_map=implicit_opinion_map,
            sentiment_map=sentiment_map,
            task_type=args.task_type,
            data_flag="Train",
        )
        dev_examples = convertor.convert_ext_examples(
            raw_examples[p1:p2],
            synonyms=synonyms,
            implicit_opinion_map=implicit_opinion_map,
            sentiment_map=sentiment_map,
            task_type=args.task_type,
            data_flag="Dev",
        )
        test_examples = convertor.convert_ext_examples(
            raw_examples[p2:],
            synonyms=synonyms,
            implicit_opinion_map=implicit_opinion_map,
            sentiment_map=sentiment_map,
            task_type=args.task_type,
            data_flag="Test",
        )
    else:
        train_examples = convertor.convert_cls_examples(raw_examples[:p1], data_flag="Train")
        dev_examples = convertor.convert_cls_examples(raw_examples[p1:p2], data_flag="Dev")
        test_examples = convertor.convert_cls_examples(raw_examples[p2:], data_flag="Test")

    # save examples
    def _save_examples(save_dir, file_name, examples):
        # one JSON object per line (the UIE training format)
        count = 0
        save_path = os.path.join(save_dir, file_name)
        with open(save_path, "w", encoding="utf-8") as f:
            for example in examples:
                f.write(json.dumps(example, ensure_ascii=False) + "\n")
                count += 1
        logger.info("Save %d examples to %s." % (count, save_path))

    _save_examples(args.save_dir, "train.json", train_examples)
    _save_examples(args.save_dir, "dev.json", dev_examples)
    _save_examples(args.save_dir, "test.json", test_examples)

    logger.info("Finished! It takes {:.2f} seconds".format(time.time() - tic_time))
-
-
if __name__ == "__main__":
    # Command-line entry point: parse arguments, then run the conversion.
    # yapf: disable
    parser = argparse.ArgumentParser()

    parser.add_argument("--label_studio_file", default="./data/label_studio.json", type=str, help="The annotation file exported from label studio platform.")
    parser.add_argument("--synonym_file", type=str, help="The synonmy file of aspect to support aspect aggregation.")
    parser.add_argument("--implicit_file", type=str, help="The implicit opinion file whose aspect not be mentioned in text, to support extraction of implicit opinion.")
    parser.add_argument("--save_dir", default="./data", type=str, help="The path of data that you wanna save.")
    parser.add_argument("--negative_ratio", default=5, type=int, help="Worked only for the extraction task, it means that for each task (aspect-based opinion extraction, aspect-based sentiment classicition) of an example, at least negative_ratio negative examples will be generated without considering synonym_file and implicit_file.")
    parser.add_argument("--splits", default=[0.8, 0.1, 0.1], type=float, nargs="*", help="The ratio of samples in datasets. [0.6, 0.2, 0.2] means 60% samples used for training, 20% for evaluation and 20% for test.")
    parser.add_argument("--task_type", choices=['ext', 'cls'], default="ext", type=str, help="Two task types [ext, cls] are supported, ext represents the aspect-based extraction task and cls represents the sentence-level classification task, defaults to ext.")
    parser.add_argument("--options", type=str, nargs="+", help="Used only for the classification task, the options for classification")
    parser.add_argument("--is_shuffle", type=strtobool, default="True", help="Whether to shuffle the labeled dataset, defaults to True.")
    parser.add_argument("--seed", type=int, default=1000, help="Random seed for initialization")

    args = parser.parse_args()
    # yapf: enable
    logger.info("Parameter Description:\n{}\n".format(args.__dict__))

    do_convert()
|