|
- # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import os
- from dataclasses import dataclass, field
- from functools import partial
- from typing import List, Optional
-
- import paddle
- from utils import convert_example, reader
-
- from paddlenlp.data import DataCollatorWithPadding
- from paddlenlp.datasets import load_dataset
- from paddlenlp.metrics import SpanEvaluator
- from paddlenlp.trainer import (
- CompressionArguments,
- PdArgumentParser,
- Trainer,
- get_last_checkpoint,
- )
- from paddlenlp.transformers import UIE, UIEM, AutoTokenizer, export_model
- from paddlenlp.utils.log import logger
-
-
- @dataclass
- class DataArguments:
- """
- Arguments pertaining to what data we are going to input our model for training and eval.
- Using `PdArgumentParser` we can turn this class into argparse arguments to be able to
- specify them on the command line.
- """
-
- train_path: str = field(
- default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
- )
-
- dev_path: str = field(
- default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
- )
-
- max_seq_length: Optional[int] = field(
- default=512,
- metadata={
- "help": "The maximum total input sequence length after tokenization. Sequences longer "
- "than this will be truncated, sequences shorter will be padded."
- },
- )
-
- dynamic_max_length: Optional[List[int]] = field(
- default=None,
- metadata={"help": "dynamic max length from batch, it can be array of length, eg: 16 32 64 128"},
- )
-
-
- @dataclass
- class ModelArguments:
- """
- Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
- """
-
- model_name_or_path: Optional[str] = field(
- default="uie-base",
- metadata={
- "help": "Path to pretrained model, such as 'uie-base', 'uie-tiny', "
- "'uie-medium', 'uie-mini', 'uie-micro', 'uie-nano', 'uie-base-en', "
- "'uie-m-base', 'uie-m-large', or finetuned model path."
- },
- )
- export_model_dir: Optional[str] = field(
- default=None,
- metadata={"help": "Path to directory to store the exported inference model."},
- )
- multilingual: bool = field(default=False, metadata={"help": "Whether the model is a multilingual model."})
-
-
- def main():
- parser = PdArgumentParser((ModelArguments, DataArguments, CompressionArguments))
- model_args, data_args, training_args = parser.parse_args_into_dataclasses()
-
- if model_args.model_name_or_path in ["uie-m-base", "uie-m-large"]:
- model_args.multilingual = True
-
- # Log model and data config
- training_args.print_config(model_args, "Model")
- training_args.print_config(data_args, "Data")
-
- paddle.set_device(training_args.device)
-
- # Log on each process the small summary:
- logger.warning(
- f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, "
- + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
- )
-
- # Detecting last checkpoint.
- last_checkpoint = None
- if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
- last_checkpoint = get_last_checkpoint(training_args.output_dir)
- if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
- raise ValueError(
- f"Output directory ({training_args.output_dir}) already exists and is not empty. "
- "Use --overwrite_output_dir to overcome."
- )
- elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
- logger.info(
- f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
- "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
- )
-
- tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
- if model_args.multilingual:
- model = UIEM.from_pretrained(model_args.model_name_or_path)
- else:
- model = UIE.from_pretrained(model_args.model_name_or_path)
-
- train_ds = load_dataset(reader, data_path=data_args.train_path, max_seq_len=data_args.max_seq_length, lazy=False)
- dev_ds = load_dataset(reader, data_path=data_args.dev_path, max_seq_len=data_args.max_seq_length, lazy=False)
-
- trans_fn = partial(
- convert_example,
- tokenizer=tokenizer,
- max_seq_len=data_args.max_seq_length,
- multilingual=model_args.multilingual,
- dynamic_max_length=data_args.dynamic_max_length,
- )
-
- train_ds = train_ds.map(trans_fn)
- dev_ds = dev_ds.map(trans_fn)
-
- if training_args.device == "npu":
- data_collator = DataCollatorWithPadding(tokenizer, padding="longest")
- else:
- data_collator = DataCollatorWithPadding(tokenizer)
-
- criterion = paddle.nn.BCELoss()
-
- def uie_loss_func(outputs, labels):
- start_ids, end_ids = labels
- start_prob, end_prob = outputs
- start_ids = paddle.cast(start_ids, "float32")
- end_ids = paddle.cast(end_ids, "float32")
- loss_start = criterion(start_prob, start_ids)
- loss_end = criterion(end_prob, end_ids)
- loss = (loss_start + loss_end) / 2.0
- return loss
-
- def compute_metrics(p):
- metric = SpanEvaluator()
- start_prob, end_prob = p.predictions
- start_ids, end_ids = p.label_ids
- metric.reset()
-
- num_correct, num_infer, num_label = metric.compute(start_prob, end_prob, start_ids, end_ids)
- metric.update(num_correct, num_infer, num_label)
- precision, recall, f1 = metric.accumulate()
- metric.reset()
-
- return {"precision": precision, "recall": recall, "f1": f1}
-
- trainer = Trainer(
- model=model,
- criterion=uie_loss_func,
- args=training_args,
- data_collator=data_collator,
- train_dataset=train_ds if training_args.do_train or training_args.do_compress else None,
- eval_dataset=dev_ds if training_args.do_eval or training_args.do_compress else None,
- tokenizer=tokenizer,
- compute_metrics=compute_metrics,
- )
-
- trainer.optimizer = paddle.optimizer.AdamW(
- learning_rate=training_args.learning_rate, parameters=model.parameters()
- )
- checkpoint = None
- if training_args.resume_from_checkpoint is not None:
- checkpoint = training_args.resume_from_checkpoint
- elif last_checkpoint is not None:
- checkpoint = last_checkpoint
-
- # Training
- if training_args.do_train:
- train_result = trainer.train(resume_from_checkpoint=checkpoint)
- metrics = train_result.metrics
- trainer.save_model()
- trainer.log_metrics("train", metrics)
- trainer.save_metrics("train", metrics)
- trainer.save_state()
-
- # Evaluate and tests model
- if training_args.do_eval:
- eval_metrics = trainer.evaluate()
- trainer.log_metrics("eval", eval_metrics)
-
- # export inference model
- if training_args.do_export:
- # You can also load from certain checkpoint
- # trainer.load_state_dict_from_checkpoint("/path/to/checkpoint/")
- if training_args.device == "npu":
- # npu will transform int64 to int32 for internal calculation.
- # To reduce useless transformation, we feed int32 inputs.
- input_spec_dtype = "int32"
- else:
- input_spec_dtype = "int64"
- if model_args.multilingual:
- input_spec = [
- paddle.static.InputSpec(shape=[None, None], dtype=input_spec_dtype, name="input_ids"),
- paddle.static.InputSpec(shape=[None, None], dtype=input_spec_dtype, name="position_ids"),
- ]
- else:
- input_spec = [
- paddle.static.InputSpec(shape=[None, None], dtype=input_spec_dtype, name="input_ids"),
- paddle.static.InputSpec(shape=[None, None], dtype=input_spec_dtype, name="token_type_ids"),
- paddle.static.InputSpec(shape=[None, None], dtype=input_spec_dtype, name="position_ids"),
- paddle.static.InputSpec(shape=[None, None], dtype=input_spec_dtype, name="attention_mask"),
- ]
- if model_args.export_model_dir is None:
- model_args.export_model_dir = os.path.join(training_args.output_dir, "export")
- export_model(model=trainer.model, input_spec=input_spec, path=model_args.export_model_dir)
- if training_args.do_compress:
-
- @paddle.no_grad()
- def custom_evaluate(self, model, data_loader):
- metric = SpanEvaluator()
- model.eval()
- metric.reset()
- for batch in data_loader:
- if model_args.multilingual:
- logits = model(input_ids=batch["input_ids"], position_ids=batch["position_ids"])
- else:
- logits = model(
- input_ids=batch["input_ids"],
- token_type_ids=batch["token_type_ids"],
- position_ids=batch["position_ids"],
- attention_mask=batch["attention_mask"],
- )
- start_prob, end_prob = logits
- start_ids, end_ids = batch["start_positions"], batch["end_positions"]
- num_correct, num_infer, num_label = metric.compute(start_prob, end_prob, start_ids, end_ids)
- metric.update(num_correct, num_infer, num_label)
- precision, recall, f1 = metric.accumulate()
- logger.info("f1: %s, precision: %s, recall: %s" % (f1, precision, f1))
- model.train()
- return f1
-
- trainer.compress(custom_evaluate=custom_evaluate)
-
-
- if __name__ == "__main__":
- main()
|