|
- import pandas as pd
- import numpy as np
- import math
- import matplotlib.pyplot as plt
- import argparse
- from sklearn import linear_model
-
-
class LRRCA:
    """Traffic-flow predictor comparing a historical-average baseline against
    a per-node linear regression ("LRRCA") whose features are the node's own
    recent mean and the recent mean of its most correlated neighbour.
    """

    def __init__(self, nid, data, csv_path, debug, fig):
        """Store the run configuration (no I/O happens here).

        :param nid: node (column) id to plot when ``fig`` is enabled
        :param data: path of the input HDF5 data file
        :param csv_path: directory where output CSV files are written
        :param debug: bool; if True, print a small sample of ground truth and
            before/after predictions to the terminal
        :param fig: bool; if True, save a comparison curve of ground truth and
            before/after predictions under the ``figures/`` directory
        """
        self.nid = nid
        self.data = data
        self.csv_path = csv_path
        self.debug = debug
        self.fig = fig
        self.method = 'LRRCA'

    def __call__(self):
        """Run the full pipeline: load the data, split it, predict with the
        baseline and the optimized method, export CSVs and report the MAE
        improvement. Returns nothing; results are printed and written to disk.
        """
        # Reading data
        data = pd.read_hdf(self.data)
        print('| 从<{}>读取文件中...'.format(self.data))

        # Split points: first 70% is training, last 20% is test
        # (the middle 10% is deliberately left out).
        T, N = data.shape
        t1, t2 = math.ceil(T * 0.7), math.ceil(T * 0.8)

        # Export the ground truth for the test window
        print('| 正在拆分数据...')
        groundtruth = data.iloc[t2:, :]
        groundtruth_file_path = self.csv_path + '/groundtruth.csv'
        print('| 预测真实值导出到<{}>...'.format(groundtruth_file_path))
        groundtruth.to_csv(groundtruth_file_path)
        print('| ---------')

        # Baseline: historical average
        print('| 优化前方法预测中....')
        baseline = self.ha(data, t1, t2, T)
        ha_file_path = self.csv_path + '/baseline.csv'
        print('| 优化前预测值导出到<{}>...'.format(ha_file_path))
        baseline.to_csv(ha_file_path)
        MAE_baseline = self.MAE(pred=baseline, label=groundtruth)
        print('| 优化前平均绝对误差值为{:.2f}'.format(MAE_baseline))
        print('| ---------')

        # Optimized: per-node linear regression
        print('| 优化后方法预测中...')
        optimized = self.kernal(data, t1, t2, T, N)
        optimized_file_path = self.csv_path + '/{}.csv'.format(self.method)
        print('| 优化后预测值导出到 {}...'.format(optimized_file_path))
        optimized.to_csv(optimized_file_path)
        MAE_method = self.MAE(pred=optimized, label=groundtruth)
        print('| 优化后平均绝对误差值为{:.2f}'.format(MAE_method))
        print('| ---------')

        # Comparison: relative MAE reduction of the optimized method
        mean_MAE_gain = (MAE_baseline - MAE_method) / MAE_baseline
        print('| 预测准确率提升平均值为{:.2f}%'.format(mean_MAE_gain * 100))

        # Debug mode: show a 4-row, 2-column sample of each result
        if self.debug:
            print('| [测试] 测试模式启动,正在输出小样本...')
            debug_start = 0

            print('| [测试] 真实值:')
            groundtruth_small = groundtruth.iloc[debug_start: debug_start + 4, [1, -1]]
            print(groundtruth_small)
            print('| ---------')

            print('| [测试] 优化前值:')
            baseline_small = baseline.iloc[debug_start: debug_start + 4, [1, -1]]
            print(baseline_small)
            MAE_baseline_small = self.MAE(pred=baseline_small, label=groundtruth_small, debug=True)
            print('| ---------')

            print('| [测试] 优化后值:')
            optimized_small = optimized.iloc[debug_start: debug_start + 4, [1, -1]]
            print(optimized_small)
            MAE_method_small = self.MAE(pred=optimized_small, label=groundtruth_small, debug=True)

        # Draw figure
        if self.fig:
            self.draw_plot(groundtruth, baseline, optimized, self.nid, self.method)
            print()

    def ha(self, data, t1, t2, T, step=288*7):
        """Historical-average baseline.

        Each test timestep is predicted as the mean of all training rows that
        share its position within the ``step``-long period (default: one week
        of 288 five-minute slots per day); test rows beyond the first period
        reuse the first period's predictions.

        :param data: full traffic data (pd.DataFrame, time x nodes)
        :param t1: end index (exclusive) of the training split
        :param t2: start index of the test split
        :param T: total number of timesteps
        :param step: period length in timesteps
        :return: pd.DataFrame of predictions aligned with ``data.iloc[t2:]``
        """
        ypred = data.iloc[t2:, :].copy()

        # First period of the test window: average the training rows that
        # occupy the same phase (i % step) of the period.
        for i in range(t2, min(T, t2 + step)):
            inds = [x for x in range(i % step, t1, step)]
            history = data.iloc[inds]
            ypred.iloc[i - t2] = history.mean()

        # Remaining periods simply repeat the first period's predictions.
        for i in range(t2 + step, T, step):
            size = min(step, T - i)
            start = i - t2
            ypred.iloc[start: start + size] = ypred.iloc[start - step: start + size - step].values

        return ypred

    def kernal(self, data, t1, t2, T, N):
        """Optimized method: per-node linear regression on two features —
        the node's own K-step mean and its most correlated neighbour's
        K-step mean.

        :param data: full traffic data (pd.DataFrame, time x nodes)
        :param t1: end index (exclusive) of the training split
        :param t2: start index of the test split
        :param T: total number of timesteps
        :param N: total number of nodes
        :return: pd.DataFrame of predictions aligned with ``data.iloc[t2:]``
        """
        flow = data.values
        W = []   # per-node weights: [coef_self, coef_neighbor, intercept]
        K = 12   # look-back window length (timesteps)
        ypred = data.iloc[t2:, :].copy()
        # Shift the test window back K steps so the first K rows provide the
        # history needed for the first real prediction.
        t2 = t2 - K
        x_train = flow[0: t1, :]
        x_test = flow[t2:, :]

        # Train: fit one linear model per node on the training split.
        R = np.corrcoef(x_train.T)
        reg = linear_model.LinearRegression()

        for i in range(N):
            R[i, i] = 0  # exclude self-correlation when picking the neighbour
            neighbor = np.argmax(R[i, :])
            feature = []
            truth = np.zeros(len(x_train) - K)

            for j in range(K, len(x_train)):
                tmp1 = x_train[j - K: j, i]
                tmp2 = x_train[j - K: j, neighbor]
                feature.append([np.mean(tmp1), np.mean(tmp2)])
                truth[j - K] = x_train[j, i]

            reg.fit(feature, truth)
            coefs = list(reg.coef_)
            coefs.append(reg.intercept_)
            W.append(coefs)
            print('训练中: {}/{}\r'.format(i+1, N), end='', flush=True)

        # Test: apply the stored weights W[i]. NOTE: the original code also
        # refit the regressor on the test split here; that fit was never used
        # for prediction (a data-leak-looking no-op) and has been removed.
        print()
        pre_total = np.zeros((N, len(x_test) - K))
        for i in range(N):
            neighbor = np.argmax(R[i, :])
            feature = []

            for j in range(K, len(x_test)):
                tmp1 = x_test[j - K: j, i]
                tmp2 = x_test[j - K: j, neighbor]
                feature.append([np.mean(tmp1), np.mean(tmp2)])

            pre_total[i] = np.dot(feature, W[i][:2]) + W[i][-1]
            print('测试中: {}/{}\r'.format(i+1, N), end='', flush=True)

        print()
        # pre_total is (nodes x time); transpose back to (time x nodes).
        ret = pd.DataFrame(pre_total).T
        ret.index = ypred.index
        ret.columns = ypred.columns
        return ret

    def MAE(self, pred, label, debug=False):
        """Mean Absolute Error: MAE = 1/M * sum(|y_pred - y_real|).

        :param pred: predictions (pd.DataFrame), same shape as ``label``
        :param label: ground truth (pd.DataFrame)
        :param debug: if True, print the intermediate error matrices
        :return: scalar MAE value
        """
        assert pred.shape == label.shape, "Unmatched Dimension"  # shapes must match to subtract
        error = abs(pred - label)            # element-wise absolute error
        errors = error.values
        maes = np.mean(errors, axis=1)       # mean across nodes, per timestep
        mae = np.mean(maes)                  # overall mean across timesteps
        if debug:
            print('| [测试] 先计算每个测试记录与真实值的误差:')
            print(errors)
            print('| [测试] 对每个时间步误差求平均绝对误差:')
            print(maes)
            print('| [测试] 对所有节点求平均绝对误差:')
            print(mae)

        return mae

    def draw_plot(self, groundtruth, baseline, optimized, nid, title):
        """Save a one-day (288-step) comparison plot for node ``nid`` to
        ``figures/<title>.png``.

        :param groundtruth: pd.DataFrame of true values
        :param baseline: pd.DataFrame of baseline predictions
        :param optimized: pd.DataFrame of optimized predictions
        :param nid: column id of the node to plot
        :param title: plot title, also used as the output file name
        """
        import os  # local import: only needed when figure output is requested
        print('| 画图中...')
        fig, ax = plt.subplots(figsize=(20, 4))
        node_info_groundtruth = groundtruth[nid][0: 288]
        node_info_baseline = baseline[nid][0: 288]
        node_info_optimized = optimized[nid][0: 288]
        ax.plot(node_info_groundtruth, label='groundtruth')
        ax.plot(node_info_baseline, label='before optimization')
        ax.plot(node_info_optimized, label='after optimization')
        ax.legend()
        ax.set_title(title)
        ax.set_xlabel('Time')
        ax.set_ylabel('Volume')
        # Create the output directory so savefig doesn't fail on first run.
        os.makedirs('figures', exist_ok=True)
        fig.savefig('figures/{}.png'.format(title))
-
if __name__ == '__main__':
    # Guarded script entry: parsing args and running the pipeline only when
    # executed directly, so importing this module has no side effects.
    parser = argparse.ArgumentParser()
    parser.add_argument('--nid', type=int, default=20501805)
    parser.add_argument('--data', type=str, default='PCL-SZ-volume/sz_cleaned.h5')
    parser.add_argument('--csv_path', type=str, default='data')
    parser.add_argument('--debug', action='store_true', default=False)
    parser.add_argument('--fig', action='store_true', default=False)
    args = parser.parse_args()
    model = LRRCA(args.nid, args.data, args.csv_path, args.debug, args.fig)
    model()
|