|
- from tqdm import tqdm
- from autox.autox_competition.process_data import Feature_type_recognition
- from autox.autox_competition.CONST import FEATURE_TYPE
-
class FeatureCount:
    """**Convert categorical features into the number of occurrences.**

    ``fit`` decides which column groups ("ops") are counted and ``transform``
    produces one ``COUNT_<col>[__<col>...]`` column per op, holding for every
    row the number of rows of the frame that share the same key values.
    """

    def __init__(self):
        self.target = None
        self.df_feature_type = None
        self.silence_cols = []
        self.select_all = None
        self.max_num = None
        # List of column-name lists; each inner list is one group-by key set.
        self.ops = []

    def fit(self, df, degree=1, target=None, df_feature_type=None, silence_cols=None,
            select_all=True, max_num=None):
        """Select the column groups that will be turned into count features.

        :param df: dataframe, train_test.
        :param degree: int, degree equal to 1 or 2. With 2, every ordered pair
            of distinct selected columns is added in front of the single columns.
        :param target: str, target column. Only consulted when ``select_all``
            is False; target-based selection is not implemented yet.
        :param df_feature_type: dict, {col: type of col}. Detected with
            ``Feature_type_recognition`` when None.
        :param silence_cols: list of columns that must never be counted.
            None means no column is excluded.
        :param select_all: bool, keep every categorical column when True;
            otherwise (and when ``target`` is None) drop columns whose number
            of distinct values exceeds 20% of the number of rows.
        :param max_num: stored on the instance; not used by this class yet.
        :raises ValueError: if ``degree`` is neither 1 nor 2.
        """
        # Explicit raise instead of assert: asserts disappear under ``python -O``.
        if degree not in (1, 2):
            raise ValueError(f"degree must be 1 or 2, got {degree!r}")

        self.target = target
        self.df_feature_type = df_feature_type
        # A fresh list per call avoids sharing one default list between instances.
        self.silence_cols = [] if silence_cols is None else silence_cols
        self.select_all = select_all
        self.max_num = max_num

        if self.df_feature_type is None:
            self.df_feature_type = Feature_type_recognition().fit(df)

        # Rebuilt from scratch on every call so that fitting twice (or fitting
        # after ``set_keys``) does not accumulate stale or duplicated ops.
        ops = [
            [feature]
            for feature, feature_type in self.df_feature_type.items()
            if feature_type == FEATURE_TYPE['cat'] and feature not in self.silence_cols
        ]

        if not self.select_all:
            if self.target is not None:
                # Train a model to select the group columns (not implemented).
                pass
            else:
                # Filter with simple statistics: a column with too many distinct
                # values gives counts that are close to constant.
                max_distinct = df.shape[0] * 0.2
                ops = [op for op in ops if df.drop_duplicates(op).shape[0] <= max_distinct]

        if degree == 2:
            # Both (a, b) and (b, a) are kept on purpose so the produced column
            # names stay the same as before.
            pairs = [first + second for first in ops for second in ops if first != second]
            ops = pairs + ops

        self.ops = ops

    def get_ops(self):
        """Return the list of column groups that ``transform`` will count."""
        return self.ops

    def set_keys(self, ops):
        """Replace the column groups to count.

        :param ops: list of lists of column names.
        """
        self.ops = ops

    def transform(self, df):
        """Compute the count features for every configured column group.

        The input frame is left untouched and the result keeps its index, so
        the returned frame can be concatenated back column-wise.

        :param df: dataframe, train_test.
        :return: dataframe, count features, one ``COUNT_...`` column per op.
        """
        features = {}
        for op in tqdm(self.ops):
            keys = list(op)
            name = 'COUNT_' + '__'.join(str(key) for key in keys)
            group_sizes = df.groupby(keys).size().reset_index(name=name)
            # A left merge keeps the row order of ``df`` and the right side has
            # unique keys, so the values line up positionally with ``df``.
            merged = df[keys].merge(group_sizes, on=keys, how='left')
            features[name] = merged[name].to_numpy()
        # ``df[[]]`` is a column-less frame that carries the original index.
        return df[[]].assign(**features)

    def fit_transform(self, df, target=None, df_feature_type=None, silence_cols=None,
                      select_all=True, max_num=None, degree=1):
        """Run ``fit`` and then ``transform`` on the same dataframe.

        :param df: dataframe, train_test.
        :param target: str, target column, forwarded to ``fit``.
        :param df_feature_type: dict, {col: type of col}, forwarded to ``fit``.
        :param silence_cols: list of columns to exclude, forwarded to ``fit``.
        :param select_all: bool, forwarded to ``fit``.
        :param max_num: forwarded to ``fit``.
        :param degree: int, 1 or 2, forwarded to ``fit``.
        :return: dataframe, count features.
        """
        self.fit(df, degree=degree, target=target, df_feature_type=df_feature_type,
                 silence_cols=silence_cols, select_all=select_all, max_num=max_num)
        return self.transform(df)
|