logo
Loading...

資料處理與特徵工程-程式碼檔案 - 資料處理與特徵工程 - Cupoy

from sklearn.datasets import load_iris import pandas as pd import numpy as np import seaborn as sns ...

"""Exploratory data analysis and feature engineering on the Iris dataset.

Walkthrough covering: missing-value / zero counts, descriptive statistics,
distribution plots, standardization, one-hot encoding, binarization,
binning, and three feature-selection techniques (random-forest importance,
chi-square SelectKBest, and recursive feature elimination).

Notebook-style script: many expressions are evaluated for their displayed
value and intentionally not assigned.
"""
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Binarizer
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot as plt
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import RFE

# Load input data
data = load_iris()
df = pd.DataFrame(data=data.data, columns=data.feature_names)
df_y = data['target']
df.head()

# Count NaNs per column
df.isnull().sum()
# Count zeros per column
df.isin([0]).sum()

# Basic descriptive statistics of one feature
max(df['sepal length (cm)'])
min(df['sepal length (cm)'])
np.mean(df['sepal length (cm)'])
np.std(df['sepal length (cm)'])
sns.boxplot(y=df['sepal length (cm)'])
sns.histplot(df['sepal length (cm)'])

# Duplicate handling / cardinality (results displayed, not assigned)
df.drop_duplicates(keep='last')
df.nunique()
# Frequency counts (mode)
df['sepal length (cm)'].value_counts()
# Median
np.percentile(df['sepal length (cm)'], 50)

# Standardize all features
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)

# One-hot encoding
s = pd.Series(list('abca'))
pd.get_dummies(s)

# Binarizer
# BUG FIX: `threshold` is keyword-only in scikit-learn >= 1.0, so the
# original `Binarizer(5)` raises TypeError there.
binarizer_1 = Binarizer(threshold=5)
binarizer_1.fit_transform(df[['sepal length (cm)']])

# Binning into two labeled intervals: (0, 5] -> "A", (5, 10] -> "B"
bins = [0, 5, 10]
pd.cut(df['sepal length (cm)'], bins, labels=["A", "B"])

# Feature importance via random forest
rf = RandomForestClassifier()
rf.fit(df, df_y)
sorted_idx = rf.feature_importances_.argsort()
# BUG FIX: the original plotted *unsorted* column labels against *sorted*
# importances, mislabeling every bar; index the labels with sorted_idx too.
plt.barh(df.columns[sorted_idx], rf.feature_importances_[sorted_idx])
plt.xlabel("Random Forest Feature Importance")

# Chi-square feature selection: keep the 2 best features
KBest = SelectKBest(chi2, k=2).fit(df, df_y)
cols = KBest.get_support()  # boolean mask over columns
features_df_new = df.iloc[:, cols]

# Recursive Feature Elimination.
# BUG FIX: the original called `rfe.get_support(2)` — get_support's sole
# parameter is the boolean flag `indices`, so the 2 was merely truthy.
# The intended "keep 2 features" belongs in n_features_to_select (which
# also matches RFE's default of half the 4 features, so behavior is
# unchanged — just explicit now).
rfe = RFE(RandomForestClassifier(), n_features_to_select=2)
rfe = rfe.fit(df, df_y)
f = rfe.get_support(indices=True)  # integer indices of the selected features
features_df_new = df.iloc[:, f]