## Import required libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import numpy as np
import time
import shap
# from sklearn.svm import SVC #支持向量机分类器
# # from sklearn.neighbors import KNeighborsClassifier #K近邻分类器
# # from sklearn.linear_model import LogisticRegression #逻辑回归分类器
# import xgboost as xgb #XGBoost分类器
# import lightgbm as lgb #LightGBM分类器
from sklearn.ensemble import RandomForestClassifier #随机森林分类器
# # from catboost import CatBoostClassifier #CatBoost分类器
# # from sklearn.tree import DecisionTreeClassifier #决策树分类器
# # from sklearn.naive_bayes import GaussianNB #高斯朴素贝叶斯分类器
# from skopt import BayesSearchCV
# from skopt.space import Integer
# from deap import base, creator, tools, algorithms
# from sklearn.model_selection import StratifiedKFold, cross_validate # 引入分层 K 折和交叉验证工具
# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # 用于评估分类器性能的指标
from sklearn.metrics import classification_report, confusion_matrix #用于生成分类报告和混淆矩阵
from sklearn.metrics import make_scorer#定义函数
# import warnings #用于忽略警告信息
# warnings.filterwarnings("ignore") # 忽略所有警告信息
#聚类
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
#3D可视化
from mpl_toolkits.mplot3d import Axes3D
import plotly.express as px
import plotly.graph_objects as go
# 导入 Pipeline 和相关预处理工具
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline # 用于创建机器学习工作流
from sklearn.compose import ColumnTransformer # 用于将不同的预处理应用于不同的列
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler # 用于数据预处理(有序编码、独热编码、标准化)
from sklearn.impute import SimpleImputer # 用于处理缺失值
# Matplotlib setup: use a CJK-capable font so Chinese labels/titles render.
plt.rcParams['font.sans-serif'] = ['STHeiti']
# Render minus signs as the ASCII hyphen-minus. With the default (True)
# matplotlib draws U+2212, which CJK fonts such as STHeiti commonly lack,
# so negative tick labels show up as empty boxes.
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.dpi'] = 100
# Load the dataset from the working directory.
data = pd.read_csv('data.csv')
# # Missing-value imputation (disabled)
# for i in data.columns:
#     if data[i].dtype!='object':
#         if data[i].isnull().sum()>0:
#             data[i].fillna(data[i].mean(),inplace=True)
#     else:
#         if data[i].isnull().sum()>0:
#             data[i].fillna(data[i].mode()[0],inplace=True)
# mapping={'10+ years':0,
# '9 years':1,
# '8 years':2,
# '7 years':3,
# '6 years':4,
# '5 years':5,
# '4 years':6,
# '3 years':7,
# '2 years':8,
# '1 year':9,
# '< 1 year':10}
# data['Years in current job']=data['Years in current job'].map(mapping)
# dummies_list=[]
# data2=pd.read_csv(r'data.csv')
# data=pd.get_dummies(data=data,drop_first=True)
# for i in data.columns:
# if i not in data2.columns:
# dummies_list.append(i)
# for i in dummies_list:
# data[i]=data[i].astype(int)
# Split the data into features/target and hold out a test set.
from sklearn.model_selection import train_test_split

# 'Credit Default' is the prediction target; 'Id' is excluded from the
# features (presumably a row identifier with no predictive value -- confirm).
x = data.drop(columns=['Credit Default', 'Id'])
y = data['Credit Default']
# Stratify on the target so the train and test sets keep the same class
# proportions; the downstream pipeline treats the target as imbalanced
# (SMOTE + class_weight='balanced'), so an unstratified 20% hold-out can
# misrepresent the minority class.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# #SMOTE
# from imblearn.over_sampling import SMOTE
# smote=SMOTE(random_state=42)
# x_train_smote,y_train_smote=smote.fit_resample(x_train,y_train)
# #标准化数据
# scaler=StandardScaler()
# x_scaled=scaler.fit_transform(x)
# ##Kmeans++
# k_range=range(2,5)
# inertia_value=[]
# silhouette_scores=[]
# ch_scores=[]
# db_scores=[]
# start_time=time.time()
# for k in k_range:
# kmeans=KMeans(n_clusters=k,random_state=42)
# kmeans_label=kmeans.fit_predict(x_scaled)#提供了每个数据点所属的簇的信息,用于区分不同簇的数据点
# inertia_value.append(kmeans.inertia_)
# silhouette=silhouette_score(x_scaled,kmeans_label)
# silhouette_scores.append(silhouette)
# ch=calinski_harabasz_score(x_scaled,kmeans_label)
# ch_scores.append(ch)
# db=davies_bouldin_score(x_scaled,kmeans_label)
# db_scores.append(db)
# # print(f'k={k}\n 惯性:{kmeans.inertia_:.2f}\n轮廓系数:{silhouette:.3f}\n CH系数:{ch:.2f}\n DB{db:.3f}')
# end_time=time.time()
# print(f'聚类分析耗时:{end_time-start_time:.4f}')
# #绘制评估指标图
# plt.figure(figsize=(12,6))
# ##肘部法则图
# plt.subplot(2,2,1)
# plt.plot(k_range,inertia_value,marker='o')
# plt.title('肘部法则确定最优聚类数 k(惯性,越小越好)')
# plt.xlabel('聚类数 (k)')
# plt.ylabel('惯性')
# plt.grid(True)
# #轮廓系数图
# plt.subplot(2,2,2)
# plt.plot(k_range,silhouette_scores,marker='o',color='orange')
# plt.title('轮廓系数确定最优聚类数 k(越大越好)')
# plt.xlabel('聚类数 (k)')
# plt.ylabel('轮廓系数')
# plt.grid(True)
# ##CH指数图
# plt.subplot(2,2,3)
# plt.plot(k_range,ch_scores,marker='o',color='red')
# plt.title('Calinski-Harabasz 指数确定最优聚类数 k(越大越好)')
# plt.xlabel('聚类数 (k)')
# plt.ylabel('CH 指数')
# plt.grid(True)
# ##DB指数
# plt.subplot(2,2,4)
# plt.plot(k_range,db_scores,marker='o',color='yellow')
# plt.xlabel('聚类数 (k)')
# plt.ylabel('DB 指数')
# plt.grid(True)
# plt.tight_layout()
# plt.show()
# #选择K值进行聚类
# selected_k=3
# kmeans=KMeans(n_clusters=selected_k,random_state=42)
# kmeans_label=kmeans.fit_predict(x_scaled)
# x['KMeans_Cluster']=kmeans_label
# ##PCA降维
# print(f"\n--- PCA 降维 ---")
# pca=PCA(n_components=3)
# x_pca=pca.fit_transform(x_scaled)
# ##聚类可视化
# plt.figure(figsize=(6,5))
# sns.scatterplot(
# x=x_pca[:,0],
# y=x_pca[:,1],
# hue=kmeans_label,
# palette='viridis'
# )
# plt.title(f'KMean Clustering with k={selected_k} (PCA Visualization)')
# plt.xlabel('PCA Component 1')
# plt.ylabel('PCA Component 2')
# plt.show()
# #3D可视化
# #准备数据
# df_pca=pd.DataFrame(x_pca)
# df_pca['cluster']=x['KMeans_Cluster']
# fig=px.scatter_3d(
# df_pca,
# x=0,
# y=1,
# z=2,
# color='cluster',
# color_continuous_scale=px.colors.sequential.Viridis,
# title='RFE特征选择的3D可视化'
# )
# fig.update_layout(
# scene=dict(
# xaxis_title='pca_0',
# yaxis_title='pca_1',
# zaxis_title='pca_2'
# ),
# width=1200,
# height=1000
# )
# fig.show()
# print(f"\n---t-SNE 降维 ---")
# n_components_tsne=3
# # 对训练集进行 fit_transform
# tsne=TSNE(
# n_components=n_components_tsne,
# perplexity=20,
# n_iter=5000,
# learning_rate='auto',
# random_state=42,
# n_jobs=-1
# )
# print("正在对训练集进行 t-SNE fit_transform...")
# start_tsne_fit_train = time.time()
# x_tsne=tsne.fit_transform(x_scaled)
# end_tsne_fit_train = time.time()
# print(f"训练集 t-SNE fit_transform 完成,耗时: {end_tsne_fit_train - start_tsne_fit_train:.2f} 秒")
# # #3D可视化
# #准备数据
# df_tsne=pd.DataFrame(x_tsne)
# df_tsne['cluster']=x['KMeans_Cluster']
# fig=px.scatter_3d(
# df_tsne,
# x=0,y=1,z=2,
# color='cluster',
# color_continuous_scale=px.colors.sequential.Viridis,
# title='T-SNE特征选择的3D可视化'
# )
# fig.update_layout(
# scene=dict(
# xaxis_title='pca_0',
# yaxis_title='pca_1',
# zaxis_title='pca_2'
# ),
# width=1200,
# height=1000
# )
# fig.show()
# ##打印KMeans聚类前几行
# print(f'KMeans Cluster labels(k={selected_k}added to x):')
# print(x[['KMeans_Cluster']].value_counts())
# start_time=time.time()
# rf1_model=RandomForestClassifier(random_state=42,class_weight='balanced')
# rf1_model.fit(x_train_smote,y_train_smote)
# explainer=shap.TreeExplainer(rf1_model)
# shap_value=explainer.shap_values(x)
# print(shap_value.shape)
# end_time=time.time()
# print(f'SHAP分析耗时:{end_time-start_time:.4f}')
# # --- 1. SHAP 特征重要性条形图 (Summary Plot - Bar) ---
# print("--- 1. SHAP 特征重要性条形图 ---")
# shap.summary_plot(shap_value[:,:,0],x,plot_type='bar',show=False)
# plt.title('shap feature importance (bar plot)')
# plt.tight_layout()
# plt.show()
# selected_features=['Purpose_debt consolidation','Home Ownership_Home Mortgage','Purpose_home improvements','Purpose_other']
# fig,axes=plt.subplots(2,2,figsize=(10,8))
# axes=axes.flatten()
# for i,feature in enumerate(selected_features):
# unique_count=x[feature].nunique()
# if unique_count<10:
# print(f'{feature}可能是离散型变量')
# else:
# print(f'{feature}可能是连续性变量')
# sns.countplot(x=x[feature],ax=axes[i])
# axes[i].set_title(f'histogram of{feature}')
# axes[i].set_xlabel(feature)
# axes[i].set_ylabel('frequency')
# plt.tight_layout()
# plt.show()
# print(x[['KMeans_Cluster']].value_counts())
# x_cluster0=x[x['KMeans_Cluster']==0]
# x_cluster1=x[x['KMeans_Cluster']==1]
# x_cluster2=x[x['KMeans_Cluster']==2]
# #簇0
# fig,axes=plt.subplots(2,2,figsize=(6,4))
# axes=axes.flatten()
# for i,feature in enumerate(selected_features):
# sns.countplot(x=x_cluster0[feature],ax=axes[i])
# axes[i].set_title(f'countplot of {feature}')
# axes[i].set_xlabel(feature)
# axes[i].set_ylabel('count')
# plt.tight_layout()
# plt.show()
# #簇1
# fig,axes=plt.subplots(2,2,figsize=(6,4))
# axes=axes.flatten()
# for i,feature in enumerate(selected_features):
# sns.countplot(x=x_cluster1[feature],ax=axes[i])
# axes[i].set_title(f'countplot of {feature}')
# axes[i].set_xlabel(feature)
# axes[i].set_ylabel('count')
# plt.tight_layout()
# plt.show()
# #簇2
# fig,axes=plt.subplots(2,2,figsize=(6,4))
# axes=axes.flatten()
# for i,feature in enumerate(selected_features):
# sns.countplot(x=x_cluster2[feature],ax=axes[i])
# axes[i].set_title(f'countplot of {feature}')
# axes[i].set_xlabel(feature)
# axes[i].set_ylabel('count')
# plt.tight_layout()
# plt.show()
# print("--- 递归特征消除 (RFE) ---")
# from sklearn.feature_selection import RFE
# base_model=RandomForestClassifier(random_state=42,class_weight='balanced')
# rfe=RFE(base_model,n_features_to_select=3)
# rfe.fit(x_train_smote,y_train_smote)
# x_train_rfe=rfe.transform(x_train_smote)
# x_test_rfe=rfe.transform(x_test)
# selected_features_rfe=x_train.columns[rfe.support_]
# print(f"RFE筛选后保留的特征数量: {len(selected_features_rfe)}")
# print(f"保留的特征: {selected_features_rfe}")
# #3D可视化
# x_selected=x[selected_features_rfe]
# df_viz=pd.DataFrame(x_selected)
# df_viz['cluster']=x['KMeans_Cluster']
# fig=px.scatter_3d(
# df_viz,
# x=selected_features_rfe[0],
# y=selected_features_rfe[1],
# z=selected_features_rfe[2],
# color='cluster',
# color_continuous_scale=px.colors.sequential.Viridis,
# title='RFE特征选择的3D可视化'
# )
# fig.update_layout(
# scene=dict(
# xaxis_title=selected_features_rfe[0],
# yaxis_title=selected_features_rfe[1],
# zaxis_title=selected_features_rfe[2]
# ),
# width=1200,
# height=1000
# )
# fig.show()
# #训练随机森林模型
# rf_model_rfe=RandomForestClassifier(random_state=42,class_weight='balanced')
# rf_model_rfe.fit(x_train_rfe,y_train_smote)
# rf_pred_rfe=rf_model_rfe.predict(x_test_rfe)
# print("\nRFE筛选后随机森林在测试集上的分类报告:")
# print(classification_report(y_test, rf_pred_rfe))
# print("RFE筛选后随机森林在测试集上的混淆矩阵:")
# print(confusion_matrix(y_test, rf_pred_rfe))
## Define the preprocessing steps
# Partition the feature columns by dtype: object-dtype columns vs. the rest.
object_cols = x.select_dtypes(include='object').columns.tolist()
numeric_cols = x.select_dtypes(exclude='object').columns.tolist()
print(f'object_cols:{object_cols}\nnumeric_cols:{numeric_cols}')

# --- Column groups handled by each sub-pipeline ---
ordinal_features = ['Years in current job']
# Category order for 'Years in current job', shortest to longest tenure.
# OrdinalEncoder assigns codes by position in this list, i.e. 0..10.
ordinal_categories = [[
    '< 1 year',
    '1 year',
    '2 years',
    '3 years',
    '4 years',
    '5 years',
    '6 years',
    '7 years',
    '8 years',
    '9 years',
    '10+ years',
]]
nominal_features = ['Home Ownership', 'Purpose', 'Term']
# Every column that is not object-dtype. Index.difference sorts its result
# by default, so the order may differ from numeric_cols.
continuous_features = x.columns.difference(object_cols).tolist()

# Ordinal feature: fill gaps with the mode, then encode by category rank.
# Values outside ordinal_categories are encoded as -1 instead of raising.
ordinal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    (
        'encoder',
        OrdinalEncoder(
            categories=ordinal_categories,
            handle_unknown='use_encoded_value',
            unknown_value=-1,
        ),
    ),
])
print("有序特征处理 Pipeline 定义完成。")

# Nominal features: fill gaps with the mode, then one-hot encode into a
# dense array; categories unseen during fit are ignored at transform time.
nominal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])
print("标称特征处理 Pipeline 定义完成。")

print(continuous_features)
# Continuous features: mean imputation only (no scaling step).
continuous_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])
print("连续特征处理 Pipeline 定义完成。")

# --- Assemble the ColumnTransformer ---
# NOTE(review): 'oridinal' is a typo for 'ordinal'. It is kept unchanged
# because the transformer name is part of generated feature names and of
# parameter paths that other code may reference.
preprocessor = ColumnTransformer(
    [
        ('oridinal', ordinal_transformer, ordinal_features),
        ('nominal', nominal_transformer, nominal_features),
        ('continuous', continuous_transformer, continuous_features),
    ],
    # Columns not listed in any group above are discarded.
    remainder='drop',
)
print("\nColumnTransformer (预处理器) 定义完成。")
# print(preprocessor)  # uncomment to inspect the preprocessor's structure
# imbalanced-learn's Pipeline is used because it accepts sampler steps such
# as SMOTE between the preprocessing and the estimator.
from imblearn.pipeline import Pipeline as ImbPipeline

# Full workflow: preprocess -> oversample the minority class -> classify.
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42, class_weight='balanced')),
])
print("\n完整的 Pipeline 定义完成。")
# print(pipeline)  # uncomment to inspect the pipeline's structure

# --- Evaluate the pipeline on the existing train/test split ---
print("\n--- 1. 默认参数随机森林 (训练集 -> 测试集) ---")
# perf_counter() is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way time.time() can.
start_time = time.perf_counter()
pipeline.fit(x_train, y_train)
pipeline_pred = pipeline.predict(x_test)
end_time = time.perf_counter()
# The reported duration covers both fitting and test-set prediction.
print(f"训练与预测耗时: {end_time - start_time:.4f} 秒")

print("\n默认随机森林 在测试集上的分类报告:")
print(classification_report(y_test, pipeline_pred))
print("默认随机森林 在测试集上的混淆矩阵:")
print(confusion_matrix(y_test, pipeline_pred))