
first commit

tags/v0.1.0
poteman 2 years ago
commit 4d75036cbf
60 changed files with 1235 additions and 0 deletions
  1. BIN      .DS_Store
  2. +1 -0    .gitignore
  3. +2 -0    .idea/.gitignore
  4. +8 -0    .idea/automl_for_competition.iml
  5. +4 -0    .idea/encodings.xml
  6. +6 -0    .idea/inspectionProfiles/profiles_settings.xml
  7. +7 -0    .idea/misc.xml
  8. +8 -0    .idea/modules.xml
  9. +6 -0    .idea/vcs.xml
  10. +206 -0 README.md
  11. +7 -0   autox/CONST.py
  12. +1 -0   autox/__init__.py
  13. BIN     autox/__pycache__/CONST.cpython-37.pyc
  14. BIN     autox/__pycache__/__init__.cpython-37.pyc
  15. BIN     autox/__pycache__/autox.cpython-37.pyc
  16. BIN     autox/__pycache__/util.cpython-37.pyc
  17. +113 -0 autox/autox.py
  18. +1 -0   autox/ensemble/__init__.py
  19. BIN     autox/ensemble/__pycache__/__init__.cpython-37.pyc
  20. BIN     autox/ensemble/__pycache__/stacking.cpython-37.pyc
  21. +48 -0  autox/ensemble/stacking.py
  22. +2 -0   autox/feature_engineer/__init__.py
  23. BIN     autox/feature_engineer/__pycache__/__init__.cpython-37.pyc
  24. BIN     autox/feature_engineer/__pycache__/fe_count.cpython-37.pyc
  25. BIN     autox/feature_engineer/__pycache__/fe_stat.cpython-37.pyc
  26. +86 -0  autox/feature_engineer/fe_count.py
  27. +82 -0  autox/feature_engineer/fe_stat.py
  28. +83 -0  autox/feature_engineer/fe_target_encoding.py
  29. +18 -0  autox/feature_selection/Feature_selction.py
  30. +0 -0   autox/feature_selection/__init__.py
  31. +1 -0   autox/file_io/__init__.py
  32. BIN     autox/file_io/__pycache__/__init__.cpython-37.pyc
  33. BIN     autox/file_io/__pycache__/read_data.cpython-37.pyc
  34. +17 -0  autox/file_io/read_data.py
  35. +0 -0   autox/join_tables/__init__.py
  36. +0 -0   autox/metrics/__init__.py
  37. +1 -0   autox/models/__init__.py
  38. BIN     autox/models/__pycache__/__init__.cpython-37.pyc
  39. BIN     autox/models/__pycache__/regressor.cpython-37.pyc
  40. +86 -0  autox/models/regressor.py
  41. +6 -0   autox/process_data/__init__.py
  42. BIN     autox/process_data/__pycache__/__init__.cpython-37.pyc
  43. BIN     autox/process_data/__pycache__/auto_label_encoder.cpython-37.pyc
  44. BIN     autox/process_data/__pycache__/concat_train_test.cpython-37.pyc
  45. BIN     autox/process_data/__pycache__/feature_combination.cpython-37.pyc
  46. BIN     autox/process_data/__pycache__/feature_filter.cpython-37.pyc
  47. BIN     autox/process_data/__pycache__/feature_type_recognition.cpython-37.pyc
  48. +29 -0  autox/process_data/auto_label_encoder.py
  49. +5 -0   autox/process_data/clip_label.py
  50. +0 -0   autox/process_data/concat_train_test.py
  51. +10 -0  autox/process_data/feature_combination.py
  52. +20 -0  autox/process_data/feature_filter.py
  53. +51 -0  autox/process_data/feature_type_recognition.py
  54. +5 -0   autox/process_data/train_test_divide.py
  55. +25 -0  autox/util.py
  56. +0 -0   demo/__init__.py
  57. +65 -0  run_oneclick.py
  58. +24 -0  setup.py
  59. +201 -0 sub/sub20210701.csv
  60. +0 -0   test/__init__.py

BIN
.DS_Store


+ 1
- 0
.gitignore

@@ -0,0 +1 @@
/data

+ 2
- 0
.idea/.gitignore

@@ -0,0 +1,2 @@
# Default ignored files
/workspace.xml

+ 8
- 0
.idea/automl_for_competition.iml

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Python 3.7 (2)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>

+ 4
- 0
.idea/encodings.xml

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="Encoding" addBOMForNewFiles="with NO BOM" />
</project>

+ 6
- 0
.idea/inspectionProfiles/profiles_settings.xml

@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>

+ 7
- 0
.idea/misc.xml

@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="JavaScriptSettings">
    <option name="languageLevel" value="ES6" />
  </component>
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (2)" project-jdk-type="Python SDK" />
</project>

+ 8
- 0
.idea/modules.xml

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/automl_for_competition.iml" filepath="$PROJECT_DIR$/.idea/automl_for_competition.iml" />
    </modules>
  </component>
</project>

+ 6
- 0
.idea/vcs.xml

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>

+ 206
- 0
README.md

@@ -0,0 +1,206 @@
# What is AutoX?
AutoX is an efficient automated machine learning tool, aimed mainly at tabular data-mining competitions.
Its highlights include:
- Strong results: AutoX clearly outperforms other solutions on several Kaggle datasets (see [Performance comparison](#performance-comparison)).
- Easy to use: AutoX exposes an sklearn-like interface, so it is easy to pick up.
- Generic: it handles both classification and regression problems.
- Automated: data cleaning, feature engineering, model tuning, and the other steps run fully automatically, with no manual intervention.
- Flexible: the components are decoupled and usable on their own; wherever the automated result falls short, expert knowledge can be plugged in through AutoX's flexible interfaces.
- Competition tricks: score-boosting tricks from past competitions are collected and published.

# Table of contents
<!-- TOC -->

- [What is AutoX?](#what-is-autox)
- [Table of contents](#table-of-contents)
- [Architecture](#architecture)
- [Quick start](#quick-start)
- [Competition tricks](#competition-tricks)
- [Performance comparison](#performance-comparison)

<!-- /TOC -->

# Architecture
```
├── automl
│   ├── CONST.py
│   ├── autox.py
│   ├── ensemble
│   ├── feature_engineer
│   ├── feature_selection
│   ├── file_io
│   ├── join_tables
│   ├── metrics
│   ├── models
│   ├── process_data
│   └── util.py
├── data
│   ├── data01
│   └── data02
├── run_demo.py
├── test
└── README.md
```

# Quick start
- Fully automatic: run_oneclick.py
```
For users who want a decent result quickly. Only a minimal amount of data information needs to be configured to build the full machine learning pipeline; a sketch of this path follows below.
```
- Semi-automatic: run_demo.ipynb
```
For users who want a better prediction. AutoX provides rich, easy-to-use interfaces that let the pipeline be configured for the data at hand, to squeeze out a stronger result.
```
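
As an illustration of the fully automatic path, here is a minimal sketch that mirrors run_oneclick.py from this repository; the dataset path, target, and column names below are placeholders, not part of the original file:
```
from autox import AutoX

# Minimal configuration: target column, main table names, id columns, data path.
# 'label', 'id', and './data/my_dataset' are placeholder values.
autox = AutoX(target='label', train_name='train.csv', test_name='test.csv',
              id=['id'], path='./data/my_dataset')

submit = autox.get_submit()   # runs the whole pipeline end to end
submit.to_csv('./submit.csv', index=False)
```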

# Performance comparison
| index | data_name | AutoX  | AutoGluon | H2o    |
| ----- | --------- | ------ | --------- | ------ |
| 1     | zhidemai  | 1.1426 | 1.9466    | 1.1927 |

# Data types
- cat: Categorical, unordered categorical variable
- ord: Ordinal, ordered categorical variable
- num: Numeric, continuous variable
- datetime: Datetime-formatted time variable
- timestamp: Timestamp-formatted time variable

# Pipeline logic
- 1. Initialize the AutoX class
```
1.1 Read the data
1.2 Concatenate train and test
1.3 Recognize the type of each column in the tables
1.4 Preprocess the data
```
- 2. Feature engineering
```
Feature engineering covers single-table and multi-table features.
Every feature engineering class provides the following:
1. automatically select the features the current operation should run on;
2. inspect the selected features;
3. modify the features the current operation should run on;
4. compute the feature values and return features whose sample count and order match the main table.
```
- 3. Feature combination
```
Merge the constructed features: the row count stays the same, the column count grows, and one wide table is returned.
```
- 4. Train/test split
```
Split the wide table back into a training set and a test set.
```
- 5. Feature filtering
```
Filter the constructed features by comparing their distributions between train and test, to avoid overfitting.
```
- 6. Model training
```
Train the model on the filtered wide-table features.
The model class provides:
1. inspecting the default model parameters;
2. model training;
3. hyperparameter tuning;
4. inspecting the model's feature importances;
5. model prediction.
```
- 7. Model prediction

# The AutoX class
```
The AutoX class manages the datasets and their metadata for the user.
Initializing it performs the following steps:
1. read the data;
2. concatenate train and test;
3. recognize the type of each column in the tables;
4. preprocess the data.
```
## Attributes
### info_: stores metadata about the datasets.
- info_['id']: List, the key columns that uniquely identify a row
- info_['target']: String, the label column of the table
- info_['shape_of_train']: Int, number of samples in the train set
- info_['shape_of_test']: Int, number of samples in the test set
- info_['feature_type']: Dict of Dict, the data type of each feature column per table
- info_['train_name']: String, name of the main training table
- info_['test_name']: String, name of the main test table

### dfs_: stores all DataFrames, both the original tables and the constructed ones.
- dfs_['train_test']: the concatenation of the train and test data
- dfs_['FE_feature_name']: data produced by feature engineering, e.g. FE_count, FE_groupby
- dfs_['FE_all']: the original features merged with all engineered features

## Methods
- concat_train_test: concatenate the training and test sets, usually run right after reading the data
- split_train_test: split the training and test sets apart again, usually run after feature engineering
- get_submit: obtain the predictions (internally this runs the complete machine learning pipeline: data preprocessing, feature engineering, model training, hyperparameter tuning, model ensembling, and model prediction)

# The AutoX pipeline consists of the following steps; let's go through the details.

## Reading data
```
Read all files under the given path. By default, the main training and test tables are concatenated
before the subsequent preprocessing and feature engineering steps, and split apart again before model prediction starts.
```

## Data preprocessing
```
- Parse year, month, day, hour, day of week, etc. from time columns (see the sketch below)
- Before each training run, drop features that carry no information (nunique of 1) from the model input
- Remove abnormal samples; drop samples whose label is nan
```
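
The time-column parsing step could look roughly like this; this is a sketch only, not the exact AutoX implementation, and the column name is a placeholder:
```
import pandas as pd

def expand_datetime(df, col):
    # Derive calendar parts from a parsed time column.
    ts = pd.to_datetime(df[col])
    df[f'{col}_year'] = ts.dt.year
    df[f'{col}_month'] = ts.dt.month
    df[f'{col}_day'] = ts.dt.day
    df[f'{col}_hour'] = ts.dt.hour
    df[f'{col}_dayofweek'] = ts.dt.dayofweek
    return df
```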

## Feature engineering
- 1-1 join features
```
```

- 1-M join features
```
- time diff features
- aggregate statistical features
```

- count features (a sketch follows below)
```
For each selected feature column, count the samples in the whole dataset whose value matches the current sample's, and use that count as the feature.
```
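
A minimal pandas sketch of the count-feature idea; the column name and data are made up, and the real implementation lives in autox/feature_engineer/fe_count.py:
```
import pandas as pd

df = pd.DataFrame({'brand': ['a', 'a', 'b', 'a', 'c']})
# Each row receives the number of rows sharing its value in the column.
df['COUNT_brand'] = df.groupby('brand')['brand'].transform('count')
# COUNT_brand -> [3, 3, 1, 3, 1]
```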

- target encoding features

- statistical features (a sketch follows below)
```
Statistical features are extracted with two nested for loops.
The outer loop iterates over all selected grouping features (group_col),
the inner loop iterates over all selected aggregation features (agg_col).
Inside the inner loop,
a categorical feature yields the statistic nunique,
a numerical feature yields the statistics [median, std, sum, max, min, mean].
```
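
A sketch of those two nested loops, assuming feature_type maps column names to 'cat'/'num'; the real logic is in autox/feature_engineer/fe_stat.py:
```
import pandas as pd

def stat_features(df, group_cols, agg_cols, feature_type):
    result = pd.DataFrame(index=df.index)
    for group_col in group_cols:            # outer loop: grouping features
        for agg_col in agg_cols:            # inner loop: aggregation features
            if group_col == agg_col:
                continue
            if feature_type[agg_col] == 'cat':
                ops = ['nunique']
            else:
                ops = ['median', 'std', 'sum', 'max', 'min', 'mean']
            for op in ops:
                result[f'{group_col}__{agg_col}__{op}'] = (
                    df.groupby(group_col)[agg_col].transform(op))
    return result
```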

- shift features


## Model training
```
AutoX currently supports the following models; by default a Lightgbm model is trained:
1. Lightgbm;
2. AutoX deep neural network.
```

## Model ensembling
```
AutoX supports the following two ensembling strategies; by default no ensembling is performed (a usage sketch of stacking follows below):
1. Stacking;
2. Bagging.
```
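
A hedged usage sketch of the StackingRegressor shipped in this repository (autox/ensemble/stacking.py); the base regressors and the toy data are illustrative choices, not prescribed by AutoX:
```
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from autox.ensemble import StackingRegressor

# Toy data purely for illustration.
X = pd.DataFrame({'f1': range(20), 'f2': range(20, 40)})
y = X['f1'] * 2.0 + 1.0

# Base regressors produce out-of-fold meta-features; the meta-regressor
# is then trained on those meta-features.
stacker = StackingRegressor(regressors=[RandomForestRegressor(), Ridge()],
                            meta_regressor=Ridge(), n_fold=5)
stacker.fit(X, y)
stacker.predict(X)   # the final predictions are stored in stacker.result
```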


# Competition tricks
criteo: bucket feature columns with a very large nunique. For example, for features whose nunique exceeds 10000, hash the values, truncate the hash to 4 characters, and then label-encode the result; a sketch follows below.
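
A small sketch of that hash-and-truncate bucketing; the column name and the choice of md5 are illustrative assumptions:
```
import hashlib
import pandas as pd

def hash_bucket(series, n_chars=4):
    # Hash each value, keep the first n_chars hex characters, then label-encode.
    hashed = series.astype(str).map(
        lambda v: hashlib.md5(v.encode()).hexdigest()[:n_chars])
    return hashed.astype('category').cat.codes

df = pd.DataFrame({'url': ['https://a.example/x', 'https://b.example/y']})
df['url_bucket'] = hash_bucket(df['url'])
```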


## Troubleshooting
| Error message | Solution |
| ------------- | -------- |


+ 7
- 0
autox/CONST.py

@@ -0,0 +1,7 @@

FEATURE_TYPE = {}
FEATURE_TYPE['num'] = 'num'
FEATURE_TYPE['cat'] = 'cat'
FEATURE_TYPE['ord'] = 'ord'
FEATURE_TYPE['datetime'] = 'datetime'
FEATURE_TYPE['timestamp'] = 'timestamp'

+ 1
- 0
autox/__init__.py

@@ -0,0 +1 @@
from .autox import AutoX

BIN
autox/__pycache__/CONST.cpython-37.pyc


BIN
autox/__pycache__/__init__.cpython-37.pyc


BIN
autox/__pycache__/autox.cpython-37.pyc


BIN
autox/__pycache__/util.cpython-37.pyc


+ 113
- 0
autox/autox.py

@@ -0,0 +1,113 @@
from .feature_engineer.fe_count import FeatureCount
from .feature_engineer.fe_stat import FeatureStat
from .file_io.read_data import read_data_from_path
from .models.regressor import CrossLgbRegression
from .process_data import feature_combination, train_test_divide, clip_label
from .process_data import feature_filter
from .process_data.feature_type_recognition import Feature_type_recognition
from .util import log

class AutoX():
    def __init__(self, target, train_name, test_name, path, feature_type=None, id=None):
        # Avoid shared mutable default arguments: normalize to fresh containers.
        if feature_type is None:
            feature_type = {}
        if id is None:
            id = []
        self.info_ = {}
        self.info_['id'] = id
        self.info_['target'] = target
        self.info_['feature_type'] = feature_type
        self.info_['train_name'] = train_name
        self.info_['test_name'] = test_name
        self.dfs_ = read_data_from_path(path)
        self.info_['max_target'] = self.dfs_[train_name][target].max()
        self.info_['min_target'] = self.dfs_[train_name][target].min()
        if feature_type == {}:
            # Recognize column types before concatenation, since concat_train_test needs them.
            for table_name in self.dfs_.keys():
                df = self.dfs_[table_name]
                feature_type_recognition = Feature_type_recognition()
                feature_type_of_table = feature_type_recognition.fit(df)
                self.info_['feature_type'][table_name] = feature_type_of_table
        self.concat_train_test()

        self.dfs_['FE_all'] = None
        self.sub = None

    def concat_train_test(self):
        self.info_['shape_of_train'] = len(self.dfs_[self.info_['train_name']])
        self.info_['shape_of_test'] = len(self.dfs_[self.info_['test_name']])
        self.dfs_['train_test'] = self.dfs_[self.info_['train_name']].append(self.dfs_[self.info_['test_name']])
        self.dfs_['train_test'].index = range(len(self.dfs_['train_test']))

        feature_type_train_test = {}
        for col in self.dfs_['train_test'].columns:
            if col in self.info_['feature_type'][self.info_['train_name']]:
                feature_type_train_test[col] = self.info_['feature_type'][self.info_['train_name']][col]
            else:
                feature_type_train_test[col] = self.info_['feature_type'][self.info_['test_name']][col]
        self.info_['feature_type']['train_test'] = feature_type_train_test

    def split_train_test(self):
        self.dfs_['FE_train'] = self.dfs_['FE_all'][:self.info_['shape_of_train']]
        self.dfs_['FE_test'] = self.dfs_['FE_all'][self.info_['shape_of_train']:]

    def get_submit(self):

        id_ = self.info_['id']
        target = self.info_['target']

        # feature engineering
        log("start feature engineer")
        df = self.dfs_['train_test']
        feature_type = self.info_['feature_type']['train_test']

        # statistical features
        log("feature engineer: Stat")
        featureStat = FeatureStat()
        featureStat.fit(df, df_feature_type=feature_type, silence_group_cols=id_ + [target],
                        silence_agg_cols=id_ + [target], select_all=False)
        self.dfs_['FE_stat'] = featureStat.transform(df)
        log(f"featureStat ops: {featureStat.get_ops()}")

        # count features
        log("feature engineer: Count")
        featureCount = FeatureCount()
        featureCount.fit(df, degree=2, df_feature_type=feature_type, silence_cols=id_ + [target], select_all=False)
        self.dfs_['FE_count'] = featureCount.transform(df)
        log(f"featureCount ops: {featureCount.get_ops()}")

        # feature combination
        log("feature combination")
        df_list = [df, self.dfs_['FE_count'], self.dfs_['FE_stat']]
        self.dfs_['FE_all'] = feature_combination(df_list)

        # split back into train and test
        train_length = self.info_['shape_of_train']
        train, test = train_test_divide(self.dfs_['FE_all'], train_length)
        log(f"shape of FE_all: {self.dfs_['FE_all'].shape}, shape of train: {train.shape}, shape of test: {test.shape}")

        # feature filtering
        log("feature filter")
        used_features = feature_filter(train, test, id_, target)
        log(f"used_features: {used_features}")

        # model training
        log("start training model")
        crossLgbRegression = CrossLgbRegression()
        crossLgbRegression.fit(train[used_features], train[target], Early_Stopping_Rounds=100, N_round=4000, Verbose=50)

        # feature importances
        fimp = crossLgbRegression.feature_importances_
        log("feature importance")
        log(fimp)

        # model prediction
        predict = crossLgbRegression.predict(test[used_features])

        # post-process the predictions
        min_ = self.info_['min_target']
        max_ = self.info_['max_target']
        predict = clip_label(predict, min_, max_)

        # build the submission (copy to avoid writing into a view of test)
        sub = test[id_].copy()
        sub[target] = predict
        sub.index = range(len(sub))

        return sub

+ 1
- 0
autox/ensemble/__init__.py

@@ -0,0 +1 @@
from .stacking import StackingRegressor

BIN
autox/ensemble/__pycache__/__init__.cpython-37.pyc


BIN
autox/ensemble/__pycache__/stacking.cpython-37.pyc


+ 48
- 0
autox/ensemble/stacking.py

@@ -0,0 +1,48 @@
import warnings
warnings.filterwarnings('ignore')
from sklearn.base import clone
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np


class StackingRegressor():
    def __init__(self, regressors, meta_regressor, n_fold=5):
        self.regressors = regressors
        self.fitted_regressors = []
        self.meta_regressor = meta_regressor
        self.train_meta = pd.DataFrame()
        self.test_meta = pd.DataFrame()
        self.n_fold = n_fold
        self.folds = KFold(n_splits=self.n_fold, shuffle=True, random_state=889)

    def fit(self, X, y):
        self.train_meta = pd.DataFrame(np.zeros([X.shape[0], len(self.regressors)]))
        self.train_meta.columns = [f"meta_feature_{i}" for i in range(1, len(self.regressors) + 1)]
        for idx, cur_regressor in enumerate(self.regressors):
            cur_fitted_regressors = []
            for fold_n, (train_index, valid_index) in enumerate(self.folds.split(X)):
                print('Training on fold {}'.format(fold_n + 1))
                # Fit a fresh clone on the fold's training split only, so the
                # out-of-fold meta-feature is a genuine held-out prediction.
                clf = clone(cur_regressor).fit(X.iloc[train_index], y.iloc[train_index])
                cur_fitted_regressors.append(clf)
                val = clf.predict(X.iloc[valid_index])
                self.train_meta.loc[valid_index, f"meta_feature_{idx + 1}"] = val
            self.fitted_regressors.append(cur_fitted_regressors)
        self.meta_regressor.fit(self.train_meta, y)

    def predict(self, X):
        self.test_meta = pd.DataFrame(np.zeros([X.shape[0], len(self.regressors)]))
        self.test_meta.columns = [f"meta_feature_{i}" for i in range(1, len(self.regressors) + 1)]
        for idx, cur_fitted_regressors in enumerate(self.fitted_regressors):
            # Average each base regressor's fold models over the test set.
            for i, cur_fitted_regressor in enumerate(cur_fitted_regressors):
                if i == 0:
                    pred = cur_fitted_regressor.predict(X) / float(self.n_fold)
                else:
                    pred += cur_fitted_regressor.predict(X) / float(self.n_fold)
            self.test_meta[f'meta_feature_{idx + 1}'] = pred
        self.result = self.meta_regressor.predict(self.test_meta)
        return self.result



+ 2
- 0
autox/feature_engineer/__init__.py

@@ -0,0 +1,2 @@
from .fe_count import FeatureCount
from .fe_stat import FeatureStat

BIN
autox/feature_engineer/__pycache__/__init__.cpython-37.pyc


BIN
autox/feature_engineer/__pycache__/fe_count.cpython-37.pyc


BIN
autox/feature_engineer/__pycache__/fe_stat.cpython-37.pyc


+ 86
- 0
autox/feature_engineer/fe_count.py

@@ -0,0 +1,86 @@
from tqdm import tqdm
import pandas as pd
from autox.process_data import Feature_type_recognition
from ..CONST import FEATURE_TYPE

class FeatureCount:
    def __init__(self):
        self.target = None
        self.df_feature_type = None
        self.silence_cols = []
        self.select_all = None
        self.max_num = None
        self.ops = []

    def fit(self, df, degree=1, target=None, df_feature_type=None, silence_cols=[], select_all=True,
            max_num=None):

        assert(degree == 1 or degree == 2)

        self.target = target
        self.df_feature_type = df_feature_type
        self.silence_cols = silence_cols
        self.select_all = select_all
        self.max_num = max_num

        if self.df_feature_type is None:
            feature_type_recognition = Feature_type_recognition()
            feature_type = feature_type_recognition.fit(df)
            self.df_feature_type = feature_type

        for feature in self.df_feature_type.keys():
            if self.df_feature_type[feature] == FEATURE_TYPE['cat'] and feature not in self.silence_cols:
                self.ops.append([feature])

        if not self.select_all:
            if self.target is not None:
                # train a model to select the columns
                pass
            else:
                # select by summary statistics
                del_count_cols = []
                for count_col in self.ops:
                    if df.drop_duplicates(count_col).shape[0] > df.shape[0] * 0.2:
                        del_count_cols.append(count_col)
                for count_col in del_count_cols:
                    self.ops.remove(count_col)

        if degree == 2:
            ops_degree_1 = self.ops
            ops = []
            for col_1 in ops_degree_1:
                for col_2 in ops_degree_1:
                    if col_1 == col_2:
                        continue
                    else:
                        ops.append(col_1 + col_2)
            self.ops = ops + ops_degree_1

    def get_ops(self):
        return self.ops

    def set_keys(self, ops):
        self.ops = ops

    def transform(self, df):
        # Build the count features in a separate frame so the caller's df is not mutated.
        result = pd.DataFrame(index=df.index)
        for op in tqdm(self.ops):
            if len(op) == 1:
                name = f'COUNT_{"__".join(op)}'
                result[name] = df.groupby(op)[op[0]].transform('count')
            else:
                col_1, col_2 = op
                name = f'COUNT_{col_1}__{col_2}'
                df_map = df.groupby([col_1, col_2]).size().to_frame()
                df_map.columns = [name]
                result[name] = df[[col_1, col_2]].merge(df_map, on=[col_1, col_2], how='left')[name].values
        return result

    def fit_transform(self, df, degree=1, target=None, df_feature_type=None, silence_cols=[], select_all=True,
                      max_num=None):
        self.fit(df, degree=degree, target=target, df_feature_type=df_feature_type, silence_cols=silence_cols,
                 select_all=select_all, max_num=max_num)
        return self.transform(df)

+ 82
- 0
autox/feature_engineer/fe_stat.py

@@ -0,0 +1,82 @@
import pandas as pd
from ..CONST import FEATURE_TYPE
from autox.process_data import Feature_type_recognition
from tqdm import tqdm

class FeatureStat:
    def __init__(self):
        self.target = None
        self.df_feature_type = None
        self.silence_group_cols = []
        self.silence_agg_cols = []
        self.select_all = None
        self.max_num = None
        self.ops = {}
        self.op_list_cat = ['nunique']
        # statistics for numeric aggregation columns (kept in line with the README)
        self.op_list_num = ['median', 'std', 'sum', 'max', 'min', 'mean']

    def fit(self, df, target=None, df_feature_type=None, silence_group_cols=[], silence_agg_cols=[],
            select_all=True, max_num=None):
        self.target = target
        self.df_feature_type = df_feature_type
        self.silence_group_cols = silence_group_cols
        self.silence_agg_cols = silence_agg_cols
        self.select_all = select_all
        self.max_num = max_num
        if self.df_feature_type is None:
            feature_type_recognition = Feature_type_recognition()
            feature_type = feature_type_recognition.fit(df)
            self.df_feature_type = feature_type

        for group_col in self.df_feature_type.keys():
            if self.df_feature_type[group_col] == FEATURE_TYPE['cat'] and group_col not in self.silence_group_cols:
                self.ops[group_col] = {}
                for agg_col in self.df_feature_type.keys():
                    if group_col == agg_col:
                        continue
                    if agg_col not in self.silence_agg_cols:
                        if self.df_feature_type[agg_col] == FEATURE_TYPE['cat']:
                            self.ops[group_col][agg_col] = self.op_list_cat
                        if self.df_feature_type[agg_col] == FEATURE_TYPE['num']:
                            self.ops[group_col][agg_col] = self.op_list_num

        if not self.select_all:
            if self.target is not None:
                # train a model to select the group columns
                pass
            else:
                # select by summary statistics
                del_group_cols = []
                for group_col in self.ops.keys():
                    if df[group_col].nunique() > df.shape[0] * 0.2:
                        del_group_cols.append(group_col)
                for group_col in del_group_cols:
                    del self.ops[group_col]

    def get_ops(self):
        return self.ops

    def set_ops(self, ops):
        self.ops = ops

    def transform(self, df):
        result = pd.DataFrame()
        for group_col in tqdm(self.ops.keys()):
            agg_cols = self.ops[group_col].keys()
            for agg_col in agg_cols:
                stats = self.ops[group_col][agg_col]
                for stat_op in stats:
                    cur_result = df.groupby(group_col)[agg_col].transform(stat_op)
                    if type(group_col) == tuple:
                        name = f'{"__".join(group_col)}__{agg_col}__{stat_op}'
                    else:
                        name = f'{group_col}__{agg_col}__{stat_op}'
                    result[name] = cur_result
        return result

    def fit_transform(self, df, target=None, df_feature_type=None, silence_group_cols=[], silence_agg_cols=[],
                      select_all=True, max_num=None):
        self.fit(df, target=target, df_feature_type=df_feature_type, silence_group_cols=silence_group_cols,
                 silence_agg_cols=silence_agg_cols, select_all=select_all, max_num=max_num)
        return self.transform(df)

+ 83
- 0
autox/feature_engineer/fe_target_encoding.py

@@ -0,0 +1,83 @@
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from ..CONST import FEATURE_TYPE
from autox.process_data import Feature_type_recognition

def FE_target_encoding(train, test, keys, col_label, k=5):
    oof_train, oof_test = np.zeros(train.shape[0]), np.zeros(test.shape[0])
    skf = KFold(n_splits=k).split(train)
    for i, (train_idx, valid_idx) in enumerate(skf):
        df_train = train[keys + [col_label]].loc[train_idx]
        df_valid = train[keys].loc[valid_idx]
        df_map = df_train.groupby(keys)[[col_label]].agg('mean')
        oof_train[valid_idx] = df_valid.merge(df_map, on=keys, how='left')[col_label].fillna(-1).values
        oof_test += test[keys].merge(df_map, on=keys, how='left')[col_label].fillna(-1).values / k
    return oof_train, oof_test

class FeatureTargetEncoding:
    def __init__(self):
        self.target = None
        self.df_feature_type = None
        self.silence_cols = []
        self.select_all = None
        self.max_num = None
        self.ops = []

    def fit(self, df, target, df_feature_type=None, silence_cols=[], select_all=True,
            max_num=None):

        self.target = target
        self.df_feature_type = df_feature_type
        self.silence_cols = silence_cols
        self.select_all = select_all
        self.max_num = max_num

        if self.df_feature_type is None:
            feature_type_recognition = Feature_type_recognition()
            feature_type = feature_type_recognition.fit(df)
            self.df_feature_type = feature_type

        for feature in self.df_feature_type.keys():
            # skip silenced columns
            if self.df_feature_type[feature] == FEATURE_TYPE['cat'] and feature not in self.silence_cols:
                self.ops.append([feature])

        if not self.select_all:
            if self.target is not None:
                # train a model to select the columns
                pass
            else:
                # select by summary statistics
                del_count_cols = []
                for count_col in self.ops:
                    if df.drop_duplicates(count_col).shape[0] > df.shape[0] * 0.1:
                        del_count_cols.append(count_col)
                for count_col in del_count_cols:
                    self.ops.remove(count_col)

    def get_ops(self):
        return self.ops

    def set_keys(self, ops):
        self.ops = ops

    def transform(self, df):
        col_target = self.target
        result = pd.DataFrame()
        for keys in self.ops:
            name = f'TARGET_ENCODING_{"__".join(keys)}'
            # copy the slices so the assignments below do not write into views of df
            train = df[~df[col_target].isnull()].copy()
            test = df[df[col_target].isnull()].copy()
            oof_train, oof_test = FE_target_encoding(train, test, keys, col_target, k=5)
            train[name] = oof_train
            test[name] = oof_test
            result[name] = pd.concat([train[name], test[name]], axis=0).loc[df.index]
        return result

    def fit_transform(self, df, target, df_feature_type=None, silence_cols=[], select_all=True,
                      max_num=None):
        self.fit(df, target=target, df_feature_type=df_feature_type, silence_cols=silence_cols,
                 select_all=select_all, max_num=max_num)
        return self.transform(df)

+ 18
- 0
autox/feature_selection/Feature_selction.py

@@ -0,0 +1,18 @@
#
# class Feature_selection():
# def __init__(self, method):
# self.method = method
#
# def fit(self, X, y):
# if self.method == 'lightgbm':
# model = lightgbm
# model.fit(X, y)
# model.importance =
#
# def transform(self):
#
# def fit_transform(self, X, y):
# return self.fit(X, y).transform(X)
#
#
#

+ 0
- 0
autox/feature_selection/__init__.py


+ 1
- 0
autox/file_io/__init__.py

@@ -0,0 +1 @@
from .read_data import read_data_from_path

BIN
autox/file_io/__pycache__/__init__.cpython-37.pyc


BIN
autox/file_io/__pycache__/read_data.cpython-37.pyc


+ 17
- 0
autox/file_io/read_data.py

@@ -0,0 +1,17 @@
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')
from ..util import log

def read_data_from_path(path, file_type='csv'):
    G_df_dict = {}
    files = os.listdir(path)
    files = [x for x in files if x.endswith('.' + file_type)]
    for item in files:
        log('[+] read {}'.format(item))
        df = pd.read_csv(os.path.join(path, item))
        log('table = {}, shape = {}'.format(item, df.shape))
        name = item
        G_df_dict[name] = df
    return G_df_dict

+ 0
- 0
autox/join_tables/__init__.py


+ 0
- 0
autox/metrics/__init__.py


+ 1
- 0
autox/models/__init__.py

@@ -0,0 +1 @@
from .regressor import CrossLgbRegression

BIN
autox/models/__pycache__/__init__.cpython-37.pyc


BIN
autox/models/__pycache__/regressor.cpython-37.pyc


+ 86
- 0
autox/models/regressor.py

@@ -0,0 +1,86 @@
import datetime
from time import time
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from ..util import log

class CrossLgbRegression(object):
    def __init__(self, params=None, n_fold=5):
        self.models = []
        self.feature_importances_ = pd.DataFrame()
        self.n_fold = n_fold
        self.params_ = {
            'objective': 'regression',
            'metric': 'mse',
            'boosting': 'gbdt',
            'learning_rate': 0.01,
            'num_leaves': 2 ** 5,
            'bagging_fraction': 0.95,
            'bagging_freq': 1,
            'bagging_seed': 66,
            'feature_fraction': 0.7,
            'feature_fraction_seed': 66,
            'max_bin': 100,
            'max_depth': 5,
            'verbose': -1
        }
        if params is not None:
            self.params_ = params
        self.Early_Stopping_Rounds = 30
        self.N_round = 600
        self.Verbose = 10

    def get_params(self):
        return self.params_

    def set_params(self, params):
        self.params_ = params

    def fit(self, X, y, Early_Stopping_Rounds=None, N_round=None, Verbose=None):
        log(X.shape)
        if Early_Stopping_Rounds is not None:
            self.Early_Stopping_Rounds = Early_Stopping_Rounds
        if N_round is not None:
            self.N_round = N_round
        if Verbose is not None:
            self.Verbose = Verbose

        folds = KFold(n_splits=self.n_fold, shuffle=True, random_state=889)
        MSEs = []
        self.feature_importances_['feature'] = X.columns

        for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):

            start_time = time()
            print('Training on fold {}'.format(fold_n + 1))

            trn_data = lgb.Dataset(X.iloc[train_index],
                                   label=y.iloc[train_index], categorical_feature="")
            val_data = lgb.Dataset(X.iloc[valid_index],
                                   label=y.iloc[valid_index], categorical_feature="")
            clf = lgb.train(self.params_, trn_data, num_boost_round=self.N_round, valid_sets=[trn_data, val_data],
                            verbose_eval=self.Verbose,
                            early_stopping_rounds=self.Early_Stopping_Rounds)
            self.models.append(clf)
            self.feature_importances_['fold_{}'.format(fold_n + 1)] = clf.feature_importance()
            val = clf.predict(X.iloc[valid_index])
            mse_ = mean_squared_error(y.iloc[valid_index], val)
            print('MSE: {}'.format(mse_))
            MSEs.append(mse_)
            print('Fold {} finished in {}'.format(fold_n + 1, str(datetime.timedelta(
                seconds=time() - start_time))))
        self.feature_importances_['average'] = self.feature_importances_[
            [x for x in self.feature_importances_.columns if x != "feature"]].mean(axis=1)
        self.feature_importances_ = self.feature_importances_.sort_values(by="average", ascending=False)
        self.feature_importances_.index = range(len(self.feature_importances_))

    def predict(self, test):
        # Average the per-fold models' predictions.
        for idx, clf in enumerate(self.models):
            if idx == 0:
                result = clf.predict(test) / self.n_fold
            else:
                result += clf.predict(test) / self.n_fold
        return result

+ 6
- 0
autox/process_data/__init__.py

@@ -0,0 +1,6 @@
from .feature_type_recognition import Feature_type_recognition
from .auto_label_encoder import auto_label_encoder
from .feature_combination import feature_combination
from .feature_filter import feature_filter
from .train_test_divide import train_test_divide
from .clip_label import clip_label

BIN
autox/process_data/__pycache__/__init__.cpython-37.pyc


BIN
autox/process_data/__pycache__/auto_label_encoder.cpython-37.pyc


BIN
autox/process_data/__pycache__/concat_train_test.cpython-37.pyc


BIN
autox/process_data/__pycache__/feature_combination.cpython-37.pyc


BIN
autox/process_data/__pycache__/feature_filter.cpython-37.pyc


BIN
autox/process_data/__pycache__/feature_type_recognition.cpython-37.pyc


+ 29
- 0
autox/process_data/auto_label_encoder.py

@@ -0,0 +1,29 @@
import pandas as pd
from tqdm import tqdm
from ..util import log

def auto_label_encoder(df, df_feature_type=None, silence_cols=[]):
    df_copy = df.copy()
    label_encoder_list = []
    if df_feature_type is not None:
        for f in tqdm(df_feature_type.keys()):
            if f in silence_cols:
                continue
            if df_feature_type[f] == 'cat':
                label_encoder_list.append(f)
                temp = pd.DataFrame(df_copy[f].astype(str))
                temp.index = range(len(temp))
                temp[f] = temp[[f]].apply(lambda x: x.astype('category').cat.codes)
                df_copy[f] = temp[f].values
    else:
        for f in tqdm(df_copy.columns):
            if silence_cols is not None and f in silence_cols:
                continue
            if 'O' == df[f].dtype:
                label_encoder_list.append(f)
                temp = pd.DataFrame(df_copy[f].astype(str))
                temp.index = range(len(temp))
                temp[f] = temp[[f]].apply(lambda x: x.astype('category').cat.codes)
                df_copy[f] = temp[f].values
    log(label_encoder_list)
    return df_copy

+ 5
- 0
autox/process_data/clip_label.py

@@ -0,0 +1,5 @@
import numpy as np

def clip_label(pred, min_, max_):
    pred = np.clip(pred, min_, max_)
    return pred

+ 0
- 0
autox/process_data/concat_train_test.py


+ 10
- 0
autox/process_data/feature_combination.py

@@ -0,0 +1,10 @@
import pandas as pd

def feature_combination(df_list):
    result = df_list[0]
    for df in df_list[1:]:
        if df.shape[0] == 0:
            continue
        assert(result.shape[0] == df.shape[0])
        result = pd.concat([result, df], axis=1)
    return result

+ 20
- 0
autox/process_data/feature_filter.py

@@ -0,0 +1,20 @@
from tqdm import tqdm
from ..util import log

def feature_filter(train, test, id_, target):
    not_used = id_ + [target]

    for col in tqdm(test.columns):
        # drop features that are entirely NaN in test
        if test.loc[test[col].isnull()].shape[0] == test.shape[0]:
            if col not in not_used:
                not_used += [col]

        # drop features whose test values are all larger (or all smaller) than the train values
        if test[col].min() > train[col].max() or test[col].max() < train[col].min():
            if col not in not_used:
                not_used += [col]
    log(f"filtered features: {not_used}")
    used_features = [x for x in list(train.describe().columns) if x not in not_used]
    return used_features

+ 51
- 0
autox/process_data/feature_type_recognition.py

@@ -0,0 +1,51 @@
import pandas as pd
import datetime

from autox.CONST import FEATURE_TYPE

# a datetime looks like: 2015-08-28 16:43:37.283
# a timestamp looks like: 1440751417.283

def detect_TIMESTAMP(df, col):
    try:
        ts_min = int(float(df.loc[~(df[col] == '') & (df[col].notnull()), col].min()))
        ts_max = int(float(df.loc[~(df[col] == '') & (df[col].notnull()), col].max()))
        datetime_min = datetime.datetime.utcfromtimestamp(ts_min).strftime('%Y-%m-%d %H:%M:%S')
        datetime_max = datetime.datetime.utcfromtimestamp(ts_max).strftime('%Y-%m-%d %H:%M:%S')
        if datetime_min > '2000-01-01 00:00:01' and datetime_max < '2030-01-01 00:00:01' and datetime_max > datetime_min:
            return True
        return False
    except:
        return False

def detect_DATETIME(df, col):
    is_DATETIME = False
    if df[col].dtypes == 'object':
        is_DATETIME = True
        try:
            pd.to_datetime(df[col])
        except:
            is_DATETIME = False
    return is_DATETIME

def get_data_type(df, col):
    if detect_DATETIME(df, col):
        return FEATURE_TYPE['datetime']
    if detect_TIMESTAMP(df, col):
        return FEATURE_TYPE['timestamp']
    if df[col].dtypes == object or df[col].dtypes == bool:
        return FEATURE_TYPE['cat']
    if 'int' in str(df[col].dtype) or 'float' in str(df[col].dtype):
        return FEATURE_TYPE['num']

class Feature_type_recognition():
    def __init__(self):
        self.df = None
        self.feature_type = None

    def fit(self, df):
        self.df = df
        self.feature_type = {}
        for col in self.df.columns:
            cur_type = get_data_type(self.df, col)
            self.feature_type[col] = cur_type
        return self.feature_type

+ 5
- 0
autox/process_data/train_test_divide.py

@@ -0,0 +1,5 @@

def train_test_divide(train_test, train_length):
    train = train_test[:train_length]
    test = train_test[train_length:]
    return train, test

+ 25
- 0
autox/util.py

@@ -0,0 +1,25 @@
import warnings
warnings.filterwarnings('ignore')

# log
import logging
LOGGER = logging.getLogger('run-time-adaptive_automl')
LOG_LEVEL = 'INFO'
# LOG_LEVEL = 'DEBUG'
LOGGER.setLevel(getattr(logging, LOG_LEVEL))
simple_formatter = logging.Formatter('%(levelname)7s -> %(message)s')
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)
console_handler.setFormatter(simple_formatter)
LOGGER.addHandler(console_handler)
LOGGER.propagate = False
nesting_level = 0

def log(entry, level='info'):
    if level not in ['debug', 'info', 'warning', 'error']:
        LOGGER.error('Wrong level input')
        level = 'info'  # fall back to info instead of failing on getattr below

    global nesting_level
    space = '-' * (4 * nesting_level)

    getattr(LOGGER, level)(f"{space} {entry}")

+ 0
- 0
demo/__init__.py


+ 65
- 0
run_oneclick.py

@@ -0,0 +1,65 @@
from autox import AutoX

data_name = 'zhidemai'
path = f'./data/{data_name}'

feature_type = {
    'train.csv': {
        'article_id': 'cat',
        'date': 'num',
        'baike_id_1h': 'cat',
        'price': 'num',
        'price_diff': 'num',
        'author': 'cat',
        'level1': 'cat',
        'level2': 'cat',
        'level3': 'cat',
        'level4': 'cat',
        'brand': 'cat',
        'mall': 'cat',
        'url': 'cat',
        'comments_1h': 'num',
        'zhi_1h': 'num',
        'buzhi_1h': 'num',
        'favorite_1h': 'num',
        'orders_1h': 'num',
        'baike_id_2h': 'cat',
        'comments_2h': 'num',
        'zhi_2h': 'num',
        'buzhi_2h': 'num',
        'favorite_2h': 'num',
        'orders_2h': 'num',
        'orders_3h_15h': 'num'
    },
    'test.csv': {
        'article_id': 'cat',
        'date': 'num',
        'baike_id_1h': 'cat',
        'price': 'num',
        'price_diff': 'num',
        'author': 'cat',
        'level1': 'cat',
        'level2': 'cat',
        'level3': 'cat',
        'level4': 'cat',
        'brand': 'cat',
        'mall': 'cat',
        'url': 'cat',
        'comments_1h': 'num',
        'zhi_1h': 'num',
        'buzhi_1h': 'num',
        'favorite_1h': 'num',
        'orders_1h': 'num',
        'baike_id_2h': 'cat',
        'comments_2h': 'num',
        'zhi_2h': 'num',
        'buzhi_2h': 'num',
        'favorite_2h': 'num',
        'orders_2h': 'num'
    }
}

autox = AutoX(target='orders_3h_15h', train_name='train.csv', test_name='test.csv',
              id=['article_id'], path=path, feature_type=feature_type)
submit = autox.get_submit()
submit.to_csv("./submit.csv", index=False)

+ 24
- 0
setup.py

@@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-
from distutils.core import setup
from setuptools import find_packages

setup(name="autox",
      version="0.1.0",
      description="automl for competition",
      author="caihengxing, zhongrunxing",
      author_email="caihengxing@4paradigm.com",
      install_requires=[
          'lightgbm',
          'numpy',
          'pandas',
          'scikit-learn',  # imported as sklearn
          'tqdm'
      ],
      python_requires='>=3.6',
      # packages=[],
      packages=find_packages(exclude=['data', 'demo', 'sub', 'test', 'README.md', 'run_oneclick.py']),
      # ext_modules=cythonize("cli_examples/primes.pyx"),  # Cython extension demo
      # package_data={"cli_examples": ["prmies.pyx"]},  # force sdist to keep the .pyx files
      include_package_data=True,
      zip_safe=False  # do not install as a zip file
      )

+ 201
- 0
sub/sub20210701.csv

@@ -0,0 +1,201 @@
article_id,sales_volume
522,
738,
741,
661,
412,
679,
627,
514,
860,
137,
812,
77,
637,
974,
939,
900,
281,
884,
762,
320,
550,
175,
372,
528,
211,
236,
102,
987,
903,
948,
347,
140,
622,
500,
371,
199,
688,
585,
902,
60,
329,
97,
313,
975,
300,
278,
925,
602,
440,
838,
571,
880,
262,
579,
24,
31,
618,
11,
222,
821,
297,
55,
543,
210,
605,
693,
663,
867,
71,
544,
108,
494,
591,
742,
293,
290,
653,
40,
590,
308,
680,
67,
276,
68,
319,
549,
999,
715,
754,
328,
383,
452,
523,
219,
788,
437,
765,
89,
64,
827,
717,
352,
937,
257,
636,
645,
555,
960,
169,
918,
529,
824,
986,
817,
87,
433,
185,
979,
535,
295,
893,
426,
714,
261,
238,
560,
584,
446,
868,
801,
600,
850,
266,
996,
530,
56,
121,
216,
26,
73,
45,
248,
722,
282,
894,
915,
811,
245,
823,
322,
644,
159,
978,
430,
942,
463,
310,
698,
61,
885,
596,
768,
650,
651,
866,
669,
299,
690,
315,
311,
362,
480,
111,
990,
487,
364,
255,
260,
803,
678,
495,
671,
378,
527,
846,
138,
356,
366,
943,
750,
949,
830,
657,
200,
214,
409,
333,
209,
614,
79,

+ 0
- 0
test/__init__.py

