|
- # !/usr/bin/env python
- # coding=UTF-8
- """
- @Author: WEN Hao
- @LastEditors: WEN Hao
- @Description:
- @Date: 2021-09-23
- @LastEditTime: 2022-04-15
- """
- import os
- from typing import Union, Optional, Sequence, NoReturn, Dict
-
- import pandas as pd
- from bs4 import BeautifulSoup as BS
-
- from .base import NLPDataset
- from utils.strings import LANGUAGE
-
-
- __all__ = [
- "JDFullTiny",
- ]
-
-
- class JDFullTiny(NLPDataset):
- """ """
-
- __name__ = "JDFullTiny"
-
- def __init__(
- self,
- subsets: Optional[Union[Sequence[str], str]] = None,
- text_size: Optional[str] = None,
- use_title: bool = False,
- remove_html: bool = True,
- max_len: Optional[int] = 512,
- ) -> NoReturn:
- """ """
- fp = os.path.dirname(os.path.abspath(__file__))
- if text_size:
- assert text_size.lower() in [
- "long",
- "xl",
- ]
- self._text_size = f"_{text_size.lower()}"
- else:
- self._text_size = ""
- fp = os.path.join(
- fp, "jd_full_tiny", f"jd_full_filtered{self._text_size}_tiny.csv.gz"
- )
- tot_ds = pd.read_csv(fp, lineterminator="\n")
- if subsets is None:
- _subsets = [
- "train",
- "test",
- ]
- elif isinstance(subsets, str):
- _subsets = [subsets]
- else:
- _subsets = subsets
- tot_ds = tot_ds[tot_ds.set.isin(_subsets)].reset_index(drop=True)
-
- col = "title" if use_title else "content"
- if remove_html:
- tot_ds[col] = tot_ds[col].apply(lambda s: BS(s, features="lxml").get_text())
-
- super().__init__(
- dataset=[(row[col], row["score"]) for _, row in tot_ds.iterrows()],
- input_columns=[
- col,
- ],
- label_map={idx + 1: idx for idx in range(5)},
- max_len=max_len,
- )
- self._name = self.__name__
- self._language = LANGUAGE.CHINESE
-
- def get_word_freq(self, use_log: bool = False) -> Dict[str, float]:
- """ """
- fp = os.path.dirname(os.path.abspath(__file__))
- cache_fp = os.path.join(
- fp,
- "jd_full_tiny",
- f"jd_full_filtered{self._text_size}_tiny_word_freq.csv.gz",
- )
- return super().get_word_freq(use_log=use_log, cache_fp=cache_fp, parallel=False)
|