2022 CUMCM (Higher Education Press Cup) Problem C, Question 1(3): Python Code

Source: https://blog.csdn.net/weixin_45481473/article/details/142316231

Contents

    • 1.3 Predicting Pre-Weathering Chemical Composition from Weathered-Point Measurements
      • Data Reshaping
      • Data Visualization
      • Regression
        • Random Forest Regression
        • XGBoost Regression
        • Gradient Boosting Regression
        • LightGBM Regression
        • CatBoost Regression
        • Decision Tree Regression
        • MLP Regression
      • Prediction
      • Data Restoration

1.3 Predicting Pre-Weathering Chemical Composition from Weathered-Point Measurements


Data Reshaping

import numpy as np
import pandas as pd

# Build a long-format frame: one row per (artifact, chemical component)
df = pd.DataFrame(columns=['文物编号', '风化标记', '化学成分含量', '化学成分标签',
                           '纹饰', '类型', '颜色', '表面风化'],
                  index=range(d12.shape[0] * 14))
df['文物编号'] = list(d12['文物编号']) * 14
df['风化标记'] = list(d12['风化标记']) * 14
df['纹饰'] = list(d12['纹饰']) * 14
df['类型'] = list(d12['类型']) * 14
df['颜色'] = list(d12['颜色']) * 14
df['表面风化'] = list(d12['表面风化']) * 14
# Repeat each of the 14 component names once per artifact
df['化学成分标签'] = list(np.repeat(list(d12.columns[6:20]), d12.shape[0]))
# Stack the 14 component columns into a single value column
a = list(d12.iloc[:, 6])
for i in range(7, 20):
    a.extend(d12.iloc[:, i])
df['化学成分含量'] = a
df.head()
   文物编号  风化标记  化学成分含量      化学成分标签  纹饰  类型  颜色  表面风化
0     1    其它   69.33  二氧化硅(SiO2)   C  高钾  蓝绿   无风化
1     2    其它   36.28  二氧化硅(SiO2)   A  铅钡  浅蓝    风化
2     3    其它   87.05  二氧化硅(SiO2)   A  高钾  蓝绿   无风化
3     3    其它   61.71  二氧化硅(SiO2)   A  高钾  蓝绿   无风化
4     4    其它   65.88  二氧化硅(SiO2)   A  高钾  蓝绿   无风化
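The column-by-column stacking above can also be written with pandas' built-in melt. A minimal sketch, assuming the same d12 layout (identifier columns first, the 14 component columns at positions 6–19); it should match the manual construction up to row order:

# Equivalent long-format reshape with pd.melt (a sketch)
id_cols = ['文物编号', '风化标记', '纹饰', '类型', '颜色', '表面风化']
value_cols = list(d12.columns[6:20])   # the 14 chemical-component columns
df_melt = d12.melt(id_vars=id_cols, value_vars=value_cols,
                   var_name='化学成分标签', value_name='化学成分含量')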

Data Visualization

import plotly.express as px
fig = px.box(df, x="化学成分标签", y="化学成分含量", color="风化标记")
# remove background color
fig.update_layout({'plot_bgcolor': 'rgba(0, 0, 0, 0)',})
fig.show()

[Figure: box plots of 化学成分含量 by 化学成分标签, colored by 风化标记]

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import warnings
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestRegressor
warnings.filterwarnings('ignore')

# Handle rows with missing 颜色 (color).
# In both weathered and unweathered glass, '浅蓝' (light blue) is the most
# frequent color, so missing values are imputed with the mode.
index = pd.isna(df['颜色'])
index = np.where(index)[0]
df.iloc[index, 6] = '浅蓝'
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 938 entries, 0 to 937
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   文物编号    938 non-null    int64
 1   风化标记    938 non-null    object
 2   化学成分含量  938 non-null    float64
 3   化学成分标签  938 non-null    object
 4   纹饰      938 non-null    object
 5   类型      938 non-null    object
 6   颜色      938 non-null    object
 7   表面风化    938 non-null    object
dtypes: float64(1), int64(1), object(6)
memory usage: 58.8+ KB
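The positional fill above hard-codes both the column index and the value '浅蓝'. A one-line equivalent (a sketch) that derives the mode directly, so the code stays correct if the most frequent color ever changes:

# Fill missing 颜色 with the column mode instead of a hard-coded value
df['颜色'] = df['颜色'].fillna(df['颜色'].mode()[0])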
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

# Reorder columns so 化学成分含量 (the target) sits next to 文物编号
df = df.iloc[:, [0, 2, 1] + list(range(3, 8))]

# Encode categorical variables as integer codes
label_encoder = LabelEncoder()
x_categorical = df.select_dtypes(include=['object']).apply(label_encoder.fit_transform)
x_numerical = df.select_dtypes(exclude=['object']).values
df_encode = pd.concat([pd.DataFrame(x_numerical), x_categorical], axis=1)
df_encode.rename(columns={0: '文物编号', 1: '化学成分含量'}, inplace=True)
df_encode.head()
   文物编号  化学成分含量  风化标记  化学成分标签  纹饰  类型  颜色  表面风化
0   1.0   69.33     1       0   2   1   6     0
1   2.0   36.28     1       0   0   0   1     1
2   3.0   87.05     1       0   0   1   6     0
3   3.0   61.71     1       0   0   1   6     0
4   4.0   65.88     1       0   0   1   6     0
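Note that `apply(label_encoder.fit_transform)` refits the single encoder object on every column in turn, so only the last column's mapping survives in `label_encoder` and the codes cannot be reliably inverted later. If decoding matters, one fitted encoder per column can be kept; a sketch under that assumption:

# Keep one fitted encoder per categorical column so codes can be inverted later
encoders = {col: LabelEncoder().fit(df[col])
            for col in df.select_dtypes(include=['object']).columns}
x_categorical = pd.DataFrame({col: enc.transform(df[col])
                              for col, enc in encoders.items()})
# e.g. encoders['类型'].inverse_transform([0, 1]) recovers the original labels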
X = df_encode.drop('化学成分含量', axis=1)
y = df_encode['化学成分含量']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

Regression

Random Forest Regression

https://www.geeksforgeeks.org/random-forest-regression-in-python/

from sklearn.metrics import mean_squared_error, r2_score

# Fit Random Forest regression to the dataset
RF = RandomForestRegressor(n_estimators=10, random_state=0, oob_score=True)
RF.fit(X_train, y_train)

# Out-of-bag score: a generalization estimate from samples each tree never saw
oob_score = RF.oob_score_
print(f'Out-of-Bag Score: {oob_score}')

# Predict on the held-out test set and evaluate
predictions = RF.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Out-of-Bag Score: 0.858876239921834
Mean Squared Error: 18.621843142992024
R-squared: 0.9440368907222724
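cross_val_score was imported earlier but never used; as a sturdier check than a single train/test split, a hedged sketch of 5-fold cross-validation (the fold count is a free choice):

from sklearn.model_selection import cross_val_score

# 5-fold cross-validated R^2 on the training data
cv_scores = cross_val_score(RF, X_train, y_train, cv=5, scoring='r2')
print(f'CV R-squared: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}')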
XGBoost Regression
import numpy as np 
import pandas as pd 
import xgboost as xg 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error as MSE 
xgb_r = xg.XGBRegressor(objective ='reg:linear', n_estimators = 10, seed = 123)
xgb_r.fit(X_train, y_train) 
[14:07:55] WARNING: /workspace/src/objective/regression_obj.cu:167: reg:linear is now deprecated in favor of reg:squarederror.

XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=10, n_jobs=0, num_parallel_tree=1,
             objective='reg:linear', random_state=123, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, seed=123, subsample=1,
             tree_method=None, validate_parameters=False, verbosity=None)
# Predict and evaluate
predictions = xgb_r.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Mean Squared Error: 31.125352266521848
R-squared: 0.9064608440301115
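As the log warns, reg:linear is deprecated; reg:squarederror is the same squared-error objective under its current name, so the following drop-in substitution silences the warning without changing the fit:

# Modern objective name; avoids the deprecation warning in the log above
xgb_r = xg.XGBRegressor(objective='reg:squarederror', n_estimators=10, seed=123)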
Gradient Boosting Regression
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

GB = GradientBoostingRegressor()
GB.fit(X_train, y_train)

# Predict and evaluate
predictions = GB.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Mean Squared Error: 44.14340281131733
R-squared: 0.8673384768386802
LightGBM Regression
from lightgbm import LGBMRegressor

gbm = LGBMRegressor()
gbm.fit(X_train, y_train)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000067 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 89
[LightGBM] [Info] Number of data points in the train set: 750, number of used features: 7
[LightGBM] [Info] Start training from score 6.549480
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
(the warning above repeats for each boosting iteration; repetitions omitted)

LGBMRegressor()
# Predict and evaluate
predictions = gbm.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Mean Squared Error: 38.507655924247906
R-squared: 0.8842752492344551
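The wall of "No further splits with positive gain" warnings is typical when LightGBM's defaults, tuned for large datasets, meet a training set of only 750 rows. Loosening the leaf constraints is one common response; a sketch with illustrative, untuned values:

# Smaller leaf constraints for a small training set (illustrative values)
gbm = LGBMRegressor(min_child_samples=5, num_leaves=15)
gbm.fit(X_train, y_train)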
CatBoost Regression
import catboost
from catboost import CatBoostRegressor

cat = CatBoostRegressor(verbose=0, n_estimators=100)
cat.fit(X_train, y_train)

# Predict and evaluate
predictions = cat.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Mean Squared Error: 19.8730131181204
R-squared: 0.94027682457278
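CatBoost can also consume the categorical columns natively, skipping the label encoding above; a sketch under the assumption that the unencoded df is used directly:

# Using CatBoost's native categorical handling on the unencoded frame
cat_cols = ['风化标记', '化学成分标签', '纹饰', '类型', '颜色', '表面风化']
X_raw = df.drop('化学成分含量', axis=1)
cat_native = CatBoostRegressor(verbose=0, n_estimators=100, cat_features=cat_cols)
cat_native.fit(X_raw, df['化学成分含量'])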
Decision Tree Regression
from sklearn.tree import DecisionTreeRegressor

tree = DecisionTreeRegressor(random_state=0)
tree.fit(X_train, y_train)

# Predict and evaluate
predictions = tree.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Mean Squared Error: 18.104645212765963
R-squared: 0.9455911946687293
MLP Regression
from sklearn.neural_network import MLPRegressor

mlp = MLPRegressor(random_state=0)
mlp.fit(X_train, y_train)

# Predict and evaluate
predictions = mlp.predict(X_test)
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')
r2 = r2_score(y_test, predictions)
print(f'R-squared: {r2}')
Mean Squared Error: 213.8644123666194
R-squared: 0.3572860974079113
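The MLP's poor R² is expected on unscaled, integer-coded inputs, since neural networks are sensitive to feature magnitudes. Wrapping it in a pipeline with standardization usually helps; a sketch, untuned:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor

# Scale inputs before the MLP
mlp_scaled = make_pipeline(StandardScaler(), MLPRegressor(random_state=0, max_iter=1000))
mlp_scaled.fit(X_train, y_train)
print(f'R-squared: {mlp_scaled.score(X_test, y_test):.4f}')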

Prediction

# Generate the prediction set: all weathered samples
df_pred = df_encode[df_encode['表面风化'] == 1].copy()
df_pred_origin = df[df_encode['表面风化'] == 1].copy()

# Set 表面风化 to 0 to represent the pre-weathering state
df_pred['表面风化'] = 0
X_pred = df_pred.drop('化学成分含量', axis=1)
X_pred
     文物编号  风化标记  化学成分标签  纹饰  类型  颜色  表面风化
1     2.0     1       0   0   0   1     0
8     7.0     1       0   1   1   6     0
9     8.0     1       0   2   0   4     0
10    8.0     0       0   2   0   4     0
11    9.0     1       0   1   1   6     0
...   ...   ...     ...  ..  ..  ..   ...
932  54.0     1       1   2   0   1     0
933  54.0     0       1   2   0   1     0
935  56.0     1       1   2   0   6     0
936  57.0     1       1   2   0   6     0
937  58.0     1       1   2   0   1     0

588 rows × 7 columns

predictions = RF.predict(X_pred)
df_pred_origin = df_pred_origin.drop('表面风化', axis=1)
df_pred_origin['风化前预测'] = predictions
df_pred_origin.head()
    文物编号  化学成分含量   风化标记      化学成分标签  纹饰  类型  颜色    风化前预测
1      2   36.28     其它  二氧化硅(SiO2)   A  铅钡  浅蓝  41.3435
8      7   92.63     其它  二氧化硅(SiO2)   B  高钾  蓝绿  60.9480
9      8   20.14     其它  二氧化硅(SiO2)   C  铅钡      37.7020
10     8    4.61  严重风化点  二氧化硅(SiO2)   C  铅钡      28.9610
11     9   95.02     其它  二氧化硅(SiO2)   B  高钾  蓝绿  64.2920

Data Restoration

# For samples that have two testing points, take the mean of the predicted values
dual = list(df_pred['文物编号'].value_counts().index[0:8])
labels = list(df_pred_origin['化学成分标签'].unique())
pre_mean = pd.DataFrame(columns=['文物编号', '化学成分标签', '风化前预测'],
                        index=range(len(dual) * len(labels)))
pre_mean['文物编号'] = list(np.repeat(dual, len(labels)))
pre_mean['化学成分标签'] = labels * len(dual)
for i in dual:
    i = int(i)
    for j in labels:
        if i in list(df_pred_origin['文物编号'].unique()):
            index = np.where((df_pred_origin['文物编号'] == i) &
                             (df_pred_origin['化学成分标签'] == j))[0]
            pre_mean.iloc[dual.index(i) * len(labels) + labels.index(j), 2] = \
                df_pred_origin.iloc[index, 7].mean()

# Keep the remaining samples, i.e. those with a single testing point
df_pred_origin.index = range(df_pred_origin.shape[0])
my_index = []
for i in list(df_pred_origin.index):
    if df_pred_origin.iloc[i, 0] not in dual:
        my_index.append(i)
df_pred_origin_sub1 = df_pred_origin.iloc[my_index, [0, 3, 7]]
df_pred_origin_sub1.head()
   文物编号      化学成分标签    风化前预测
0     2  二氧化硅(SiO2)  41.3435
1     7  二氧化硅(SiO2)  60.9480
4     9  二氧化硅(SiO2)  64.2920
5    10  二氧化硅(SiO2)  64.2120
6    11  二氧化硅(SiO2)  41.2345
pre_mean.head()
   文物编号      化学成分标签     风化前预测
0  50.0  二氧化硅(SiO2)  51.22975
1  50.0   氧化钠(Na2O)     0.694
2  50.0    氧化钾(K2O)     0.033
3  50.0    氧化钙(CaO)    2.6215
4  50.0    氧化镁(MgO)    0.7615
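The nested loop that averages the duplicate testing points can be collapsed into a single groupby; a sketch that should reproduce pre_mean up to row order and dtypes:

# Mean prediction per (artifact, component) for artifacts with two testing points
pre_mean_alt = (df_pred_origin[df_pred_origin['文物编号'].isin(dual)]
                .groupby(['文物编号', '化学成分标签'], as_index=False)['风化前预测']
                .mean())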
df_merge = pd.concat([df_pred_origin_sub1, pre_mean])
df_merge = df_merge.sort_values(['化学成分标签', '文物编号'], ascending=[True, True])
df_merge
     文物编号      化学成分标签    风化前预测
0     2.0  二氧化硅(SiO2)  41.3435
1     7.0  二氧化硅(SiO2)   60.948
28    8.0  二氧化硅(SiO2)  33.3315
4     9.0  二氧化硅(SiO2)   64.292
5    10.0  二氧化硅(SiO2)   64.212
...   ...         ...      ...
204  53.0    氧化镁(MgO)     1.02
60   54.0    氧化镁(MgO)   0.8755
207  56.0    氧化镁(MgO)      0.0
208  57.0    氧化镁(MgO)      0.0
209  58.0    氧化镁(MgO)    0.743

476 rows × 3 columns

# Reshape the long-format predictions into an artifacts × components matrix
nrow = len(df_merge['文物编号'].unique())
ncol = len(df_merge['化学成分标签'].unique())
df_shape = np.array(df_merge['风化前预测']).reshape(ncol, nrow)
df_results = pd.DataFrame(np.transpose(df_shape),
                          columns=list(df_merge.iloc[:, 1].unique()),
                          index=list(df_merge.iloc[:, 0].unique()))
# Restore the original component-column order
columns_order = list(d2.columns)[1:15]
df_results = df_results.reindex(columns=columns_order)
# Rescale each row so the components sum to 100 (percent composition)
for i in range(df_results.shape[0]):
    df_results.iloc[i, :] = df_results.iloc[i, :] / list(df_results.sum(axis=1) / 100)[i]
df_results.to_csv('/home/shiyu/Desktop/path_acdemic/ant/数模/历年题目/2022/output/df_results.csv', index=True)
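The manual reshape-and-transpose plus the row loop can be replaced by a pivot and a vectorized row normalization; a sketch assuming df_merge and columns_order as built above:

# Pivot to artifacts × components, then rescale each row to sum to 100
df_results_alt = df_merge.pivot(index='文物编号', columns='化学成分标签', values='风化前预测')
df_results_alt = df_results_alt.reindex(columns=columns_order)
df_results_alt = df_results_alt.div(df_results_alt.sum(axis=1), axis=0) * 100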

