I've run into a strange problem where the accuracy differs a lot. Could the instructor please help me look at why?
Here is the code I ran:

```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import HalvingRandomSearchCV as HRSCV
from sklearn.ensemble import RandomForestRegressor
from eda_module import data_cleaning
from eda_module import feature_engineering
from eda_module import k_means_binning
from eda_module import regression_report
from eda_module import view_miss_data
from eda_module import view_discrete_data
from eda_module import view_continual_data
from eda_module import dection_datatype
from sklearn.cluster import KMeans
import warnings

warnings.filterwarnings("ignore")

train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
total = pd.concat([train, test], axis=0)
total = data_cleaning(train, test)
total = feature_engineering(total)

# ------- StandardScaler (for k-means) -------
from sklearn.preprocessing import StandardScaler

# Split the columns by dtype so only the numeric ones get scaled.
int_features = []
float_features = []
object_features = []
for dtype, feature in zip(total.dtypes, total.columns):
    if dtype == 'float64':
        float_features.append(feature)
    elif dtype == 'int64':
        int_features.append(feature)
    else:  # dtype == 'object'
        object_features.append(feature)

c_cols = int_features + float_features
sc = StandardScaler()
for_km = total.copy()
for i in c_cols:
    scalers = sc.fit_transform(for_km[[i]])
    for_km[i] = pd.DataFrame(scalers)  # with this line the error becomes ~10%; removing it brings it back to ~6%

# Cluster the (scaled) copy and add the cluster id as a new feature.
km = KMeans(n_clusters=10)
y_pred = km.fit_predict(for_km)
total['data_kmean'] = y_pred

data_y = total['SalePrice']
ids = total['Id']
total.drop(columns=['Id', 'SalePrice'], inplace=True)
# print(len(total.columns))

X_train = total[:1095]
y_train = data_y[:1095]
X_test = total[1095:]
y_test = data_y[1095:]

RF = RandomForestRegressor()
RF.fit(X_train, y_train)
val_pred = RF.predict(X_test)
report_number_list = regression_report(y_test, val_pred, True)
```

When I run it like this, the error is mape = 0.0975. If I delete the line `for_km[i] = pd.DataFrame(scalers)`, the mape becomes 0.0632 instead. I can't figure out why this happens and would appreciate the instructor's explanation.
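For reference, a minimal self-contained sketch (toy data, not the course dataset) of the same standardization step done in a single `fit_transform` call; apart from `StandardScaler`, everything below is illustrative and not taken from the original code:

```python
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Toy DataFrame with a deliberately non-default index, standing in for the
# numeric part of for_km; 'a' and 'b' are made-up column names.
df = pd.DataFrame({'a': [1.0, 2.0, 3.0, 4.0],
                   'b': [10, 20, 30, 40]},
                  index=[5, 6, 7, 8])
c_cols = ['a', 'b']

# fit_transform returns a plain NumPy array, and assigning it back to the
# selected columns is positional, so the DataFrame's own index is untouched.
df[c_cols] = StandardScaler().fit_transform(df[c_cols])
print(df)

# Wrapping the scaled array in pd.DataFrame(...) instead gives it a fresh
# 0..n-1 index, and column assignment in pandas aligns on the index, so with
# a non-default index the values can land on different rows or become NaN.
```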
Replies
- 2021/08/24 4:11 PM · Lance
It's only when you add `for_km[i] = pd.DataFrame(scalers)` that the standardized values are actually written back into `for_km`, and standardization has a big impact on how k-means behaves. My current guess is that this is why the error differs by so much.
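To make that point concrete, here is a minimal self-contained sketch (synthetic data; none of the names below come from the course code). K-means clusters by Euclidean distance, so a feature with a much larger numeric range dominates the result until the features are standardized:

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_rand_score

# The real two-group structure lives in a small-scale feature, while a
# large-scale noise feature dominates the raw Euclidean distances.
rng = np.random.default_rng(0)
true_labels = np.repeat([0, 1], 100)
informative = np.where(true_labels == 0, 0.0, 3.0) + rng.normal(0, 0.3, 200)  # range roughly 0-3
noise = rng.normal(0, 500, 200)                                               # range roughly +/-1500
X = np.column_stack([informative, noise])

km_raw = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X)
km_scaled = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(
    StandardScaler().fit_transform(X))

# Without scaling the noise feature drives the clustering; after scaling the
# true two-group structure is recovered almost perfectly.
print('ARI raw   :', adjusted_rand_score(true_labels, km_raw))     # close to 0
print('ARI scaled:', adjusted_rand_score(true_labels, km_scaled))  # close to 1
```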
- 2021/08/25 12:47 AM · Ma Chao Ting
Sorry, I didn't describe this clearly. If I keep the line `for_km[i] = pd.DataFrame(scalers)` but comment out `total['data_kmean'] = y_pred`, the mape goes back to 0.0975. But as soon as I comment out `for_km[i] = pd.DataFrame(scalers)`, the mape returns to 0.0632 regardless of whether `total['data_kmean'] = y_pred` is kept or not.
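One way to narrow this down, offered only as a sketch: both `KMeans` and `RandomForestRegressor` are stochastic by default, so fixing `random_state` and comparing the model with and without the `data_kmean` column isolates the effect of the cluster feature from run-to-run noise. The helper below is hypothetical; it assumes the `X_train`/`X_test`/`y_train`/`y_test` splits from the question and uses scikit-learn's `mean_absolute_percentage_error` in place of the custom `regression_report`:

```python
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error


def mape_with_and_without(col, X_train, X_test, y_train, y_test):
    """Fit the same forest twice, with and without `col`, and return both MAPEs."""
    scores = {}
    for use_col in (True, False):
        cols = [c for c in X_train.columns if use_col or c != col]
        rf = RandomForestRegressor(random_state=0)  # fixed seed for a fair comparison
        rf.fit(X_train[cols], y_train)
        scores['with' if use_col else 'without'] = mean_absolute_percentage_error(
            y_test, rf.predict(X_test[cols]))
    return scores


# Hypothetical usage with the variables from the original code:
# print(mape_with_and_without('data_kmean', X_train, X_test, y_train, y_test))
```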