|
#!/usr/bin/env python
- # coding=UTF-8
- """
- @Author: WEN Hao
- @LastEditors: WEN Hao
- @Description:
- @Date: 2021-09-08
- @LastEditTime: 2022-04-15
- """
- import os
- from typing import Union, Optional, Sequence, NoReturn, Dict
-
- import pandas as pd
- from bs4 import BeautifulSoup as BS
-
- from .base import NLPDataset
- from utils.strings import LANGUAGE
-
-
- __all__ = [
- "IMDBReviewsTiny",
- ]
-
-
class IMDBReviewsTiny(NLPDataset):
    """Tiny subset of the IMDB movie reviews sentiment dataset.

    Loads reviews from the bundled ``imdb_reviews_tiny.csv.gz`` file and
    exposes them as ``(text, label)`` pairs through the :class:`NLPDataset`
    interface.
    """

    __name__ = "IMDBReviewsTiny"

    def __init__(
        self,
        subsets: Optional[Union[Sequence[str], str]] = None,
        remove_html: bool = True,
        max_len: Optional[int] = 512,
    ) -> None:
        """Load the requested subset(s) of the tiny IMDB reviews dataset.

        Parameters
        ----------
        subsets : str or Sequence[str], optional
            Subset name(s) to load, drawn from ``{"train", "test"}``;
            if None (default), both subsets are loaded.
        remove_html : bool, default True
            If True, strip HTML markup from the review texts
            via BeautifulSoup.
        max_len : int, optional
            Maximum sample length, forwarded to the base class.
        """
        fp = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "imdb_reviews_tiny",
            "imdb_reviews_tiny.csv.gz",
        )
        tot_ds = pd.read_csv(fp, lineterminator="\n")
        # normalize `subsets` into a list of subset names
        if subsets is None:
            _subsets = ["train", "test"]
        elif isinstance(subsets, str):
            _subsets = [subsets]
        else:
            _subsets = list(subsets)
        # bracket access: the column is named "set", which would shadow
        # (or be shadowed by) DataFrame attributes under dot access
        tot_ds = tot_ds[tot_ds["set"].isin(_subsets)].reset_index(drop=True)
        if remove_html:
            tot_ds["text"] = tot_ds["text"].apply(
                lambda s: BS(s, features="lxml").get_text()
            )

        super().__init__(
            dataset=[(row["text"], row["label"]) for _, row in tot_ds.iterrows()],
            input_columns=["text"],
            max_len=max_len,
        )
        self._name = self.__name__
        self._language = LANGUAGE.ENGLISH

    def get_word_freq(self, use_log: bool = False) -> Dict[str, float]:
        """Return the word-frequency mapping of the dataset.

        Results are cached on disk next to the dataset file; the heavy
        lifting is delegated to the base-class implementation.

        Parameters
        ----------
        use_log : bool, default False
            If True, return log-frequencies instead of raw frequencies.

        Returns
        -------
        Dict[str, float]
            Mapping from word to its (log-)frequency.
        """
        cache_fp = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "imdb_reviews_tiny",
            "imdb_reviews_tiny_word_freq.csv.gz",
        )
        return super().get_word_freq(use_log=use_log, cache_fp=cache_fp, parallel=True)
|