# 4paradigm/AutoX

 
			
							import lightgbm as lgb
import numpy as np
from sklearn.model_selection import GroupShuffleSplit
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')


class FeatureOne2Many:
    """**Create features from table with one to many relationship.**

    1. Pass the label of the primary table to the secondary table;
    2. Train model (5-fold cross validation, folds grouped by the join keys)
       and predict out-of-fold in the secondary table;
    3. Aggregate the prediction results of the secondary table to obtain statistical information.

    Example::
        `feature_one2many_autox <https://www.kaggle.com/code/poteman/feature-one2many-autox>`_

    """
    def __init__(self):
        pass

    @staticmethod
    def _group_ids(t2, on):
        """Return a 1-D integer array holding one group id per row of ``t2``.

        Rows sharing the same combination of ``on`` values get the same id, so
        the result can be used as ``groups`` for a grouped splitter even when
        several key columns are involved.
        """
        return t2.groupby(on, sort=False).ngroup().to_numpy()

    @staticmethod
    def _aggregate(t2, on):
        """Aggregate ``one2many_predict`` per key and flatten the column names."""
        result = t2.groupby(on).agg(
            {'one2many_predict': ['max', 'min', 'median', 'mean', 'std', 'count']}).reset_index()
        # Key columns come back as (name, ''); statistic columns as (name, stat).
        result.columns = ['_'.join(x) if x[1] != '' else ''.join(x) for x in result.columns]
        return result

    @staticmethod
    def _train_kwargs(early_stopping_rounds):
        """Build the logging / early-stopping arguments for ``lgb.train``.

        LightGBM 4.0 removed the ``verbose_eval`` and ``early_stopping_rounds``
        keyword arguments in favour of callbacks. ``lgb.log_evaluation`` exists
        from 3.3 onwards, so its presence selects the callback form; older
        releases keep the legacy keyword arguments.
        """
        if hasattr(lgb, 'log_evaluation'):
            # Fresh callback objects per call: the early-stopping callback is stateful.
            return {'callbacks': [lgb.early_stopping(early_stopping_rounds, verbose=False),
                                  lgb.log_evaluation(period=0)]}
        return {'verbose_eval': False, 'early_stopping_rounds': early_stopping_rounds}

    def fit(self, t1, t2, on, target):
        """
        Learn per-key aggregate features from out-of-fold predictions on ``t2``.

        :param t1: dataframe, primary table
        :param t2: dataframe, secondary table
        :param on: list, column names to join on (a single column name is also accepted)
        :param target: str, target column name
        :raises ValueError: if ``t2`` has no feature columns besides the keys,
            or contains fewer than two distinct key groups
        """
        # Local import: the module only imports GroupShuffleSplit at file level.
        from sklearn.model_selection import GroupKFold

        on = [on] if isinstance(on, str) else list(on)
        self.on = on
        self.target = target

        # merge() returns a new frame, so the caller's t2 is not modified.
        t2 = t2.merge(t1[on + [target]], on=on, how='left')
        excluded = on + [target]
        used = [col for col in t2.columns if col not in excluded]
        if not used:
            raise ValueError("t2 has no feature columns besides the join keys")

        # Label-encode object columns and hand them to LightGBM as categoricals.
        cat_cols = []
        for col in used:
            if t2[col].dtype == 'O':
                t2[col] = LabelEncoder().fit_transform(list(t2[col].astype(str)))
                cat_cols.append(col)

        lr = 0.1
        early_stopping_rounds = 150
        n_round = 200
        params = {'num_leaves': 41,
                  'min_child_weight': 0.03454472573214212,
                  'feature_fraction': 0.3797454081646243,
                  'bagging_fraction': 0.4181193142567742,
                  'min_data_in_leaf': 96,
                  'objective': 'binary',
                  'max_depth': -1,
                  'learning_rate': lr,
                  "boosting_type": "gbdt",
                  "bagging_seed": 11,
                  "metric": 'auc',
                  "verbosity": -1,
                  'reg_alpha': 0.3899927210061127,
                  'reg_lambda': 0.6485237330340494,
                  'random_state': 47,
                  'num_threads': 16
                  }

        groups = self._group_ids(t2, on)
        n_groups = len(np.unique(groups))
        if n_groups < 2:
            raise ValueError("at least two distinct key groups are required for cross validation")
        n_splits = min(5, n_groups)

        features = t2[used]
        labels = t2[target]
        labelled = labels.notna().to_numpy()

        # Float buffer: every row receives exactly one out-of-fold prediction,
        # so no placeholder value can leak into the aggregates.
        oof = np.full(len(t2), np.nan)
        splitter = GroupKFold(n_splits=n_splits)
        for train_idx, test_idx in tqdm(splitter.split(features, groups=groups), total=n_splits):
            # Rows whose key has no label in t1 cannot be trained on, but are still predicted.
            train_idx = train_idx[labelled[train_idx]]
            trn_data = lgb.Dataset(features.iloc[train_idx], label=labels.iloc[train_idx],
                                   categorical_feature=cat_cols)
            # NOTE(review): the only validation set is the training data itself, so
            # early stopping has no held-out signal here - TODO consider a real validation set.
            clf = lgb.train(params, trn_data, num_boost_round=n_round, valid_sets=[trn_data],
                            **self._train_kwargs(early_stopping_rounds))

            oof[test_idx] = clf.predict(features.iloc[test_idx])

        t2['one2many_predict'] = oof
        self.result = self._aggregate(t2, on)

    def transform(self, df):
        """
        :param df: dataframe, primary table that requires feature construction
        :return: dataframe, the join keys of ``df`` plus the created features
            (left join, so keys unseen during ``fit`` get NaN features)
        """
        return df[self.on].merge(self.result, on=self.on, how='left')