The CCF Big Data & Computing Intelligence Contest (CCF BDCI) was founded by the China Computer Federation in 2013. Organized under the guidance of the National Natural Science Foundation of China, it is a large-scale challenge covering algorithms, applications, and systems in big data and artificial intelligence. The contest collects problem statements from key industries and application domains, is driven by cutting-edge technology and real industry problems, and aims to promote industry development and upgrading. Through crowdsourcing and collective intelligence it brings together expertise from industry, academia, research, and practice at home and abroad, and has discovered and trained a large number of high-quality data talents for society.
To date the contest has been held successfully eight times, attracting more than 120,000 participants from over 1,500 universities, 1,800 enterprises and public institutions, and more than 80 research institutes worldwide. It has become one of the most influential events in China's big data and AI community and the leading comprehensive big data competition brand in the country.
The ninth edition in 2021, themed "Data-driven innovation, competition-powered intelligence", is based in Yuhang and open to the world, running from September to December. It focuses on solving real pain points and hard problems from government and enterprise scenarios, invites outstanding teams worldwide to take part in developing and using data resources, and broadly solicits IT application solutions.
First, here is a baseline to get everyone started.
#%%
# Plan: build new features -> model
# Transfer learning for the model (using the internet dataset)
# Filter the fields + train the model
#%%
import pandas as pd
#%%
# Load the datasets
train_data = pd.read_csv('./train_public.csv')
submit_example = pd.read_csv('./submit_example.csv')
test_public = pd.read_csv('./test_public.csv')
train_inte = pd.read_csv('./train_internet.csv')
#%%
train_data.shape, train_inte.shape
#%%
train_data
#%%
train_data.info()
#%%
train_data['work_year']
#%%
train_data['work_year'].unique()
#%%
work_year_dict = {'< 1 year': 0,
                  '1 year': 1,
                  '2 years': 2,
                  '3 years': 3,
                  '4 years': 4,
                  '5 years': 5,
                  '6 years': 6,
                  '7 years': 7,
                  '8 years': 8,
                  '9 years': 9,
                  '10+ years': 10}
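Before applying the mapping, a quick sanity check (a small sketch, not part of the original baseline) confirms that the dictionary covers every non-missing value of work_year, so that .map() only produces NaN where the field was already missing.
#%%
# Values of work_year not covered by work_year_dict (should print an empty set)
set(train_data['work_year'].dropna().unique()) - set(work_year_dict)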
#%%
train_data['work_year'].map(work_year_dict)
#%%
train_data['work_year'] = train_data['work_year'].map(work_year_dict)
#%%
test_public
#%%
test_public['work_year'] = test_public['work_year'].map(work_year_dict)
#%%
train_data['class'].unique()
#%%
train_data['class'].value_counts()
#%%
import seaborn as sns
sns.boxplot(x='class',y='isDefault',data=train_data)
#%%
train_data.groupby('class')['isDefault'].mean()
#%%
class_dict = {'A': 1,
              'B': 2,
              'C': 3,
              'D': 4,
              'E': 5,
              'F': 6,
              'G': 7}
#%%
train_data['class'] = train_data['class'].map(class_dict)
#%%
test_public['class'] = test_public['class'].map(class_dict)
#%%
train_data.isnull().sum()
#%%
train_data.info()
#%%
train_data['earlies_credit_mon']
#%%
train_data['issue_date']
#%%
# Convert issue_date to datetime
train_data['issue_date'] = pd.to_datetime(train_data['issue_date'])
test_public['issue_date'] = pd.to_datetime(test_public['issue_date'])
#%%
train_data['issue_date']
#%%
# Extract the month
train_data['issue_date_month'] = train_data['issue_date'].dt.month
test_public['issue_date_month'] = test_public['issue_date'].dt.month
#%%
train_data['issue_date_dayofweek'] = train_data['issue_date'].dt.dayofweek
test_public['issue_date_dayofweek'] = test_public['issue_date'].dt.dayofweek
#%%
train_data['issue_date_dayofweek']
#%%
train_data.info()
#%%
train_data.groupby('issue_date_month')['isDefault'].mean().plot(kind='bar')
#%%
train_data.groupby('issue_date_dayofweek')['isDefault'].mean().plot(kind='bar')
#%%
# Import the model library
import lightgbm as lgb
#%%
train_data['employer_type'].value_counts()
#%%
train_data['industry'].value_counts()
#%%
# Label-encode the categorical text fields
cols = ['employer_type', 'industry']
from sklearn.preprocessing import LabelEncoder
for col in cols:
    lbl = LabelEncoder().fit(train_data[col])
    train_data[col] = lbl.transform(train_data[col])
    test_public[col] = lbl.transform(test_public[col])
    # Apply the same encoding to the internet dataset
    train_inte[col] = lbl.transform(train_inte[col])
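Fitting each LabelEncoder only on train_public will raise an error if test_public or train_internet contains a category that never appears in the public training set. If that happens, a defensive variant (a sketch, meant to replace the loop above rather than run after it) is to fit on the union of categories from all three datasets:
#%%
# Alternative: fit each encoder on all categories seen in any of the three datasets
for col in cols:
    all_values = pd.concat([train_data[col], test_public[col], train_inte[col]]).astype(str)
    lbl = LabelEncoder().fit(all_values)
    train_data[col] = lbl.transform(train_data[col].astype(str))
    test_public[col] = lbl.transform(test_public[col].astype(str))
    train_inte[col] = lbl.transform(train_inte[col].astype(str))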
#%%
train_data.info()
#%%
train_data['earlies_credit_mon']
#%%
import re
#%%
def findDig(val):
    # Normalize earlies_credit_mon strings so that pd.to_datetime can parse them
    fd = re.search(r'(\d+-)', val)
    if fd is None:
        return '1-' + val
    return val + '-01'
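To make the two branches concrete, here is what the helper returns on two illustrative strings (the example values are assumptions for illustration, not taken from the dataset): a value with no "digits followed by a dash" pattern gets a '1-' day prefix, while a value that already contains such a pattern gets a '-01' suffix.
#%%
# Illustrative only: exercising both branches of findDig
findDig('Aug-2001')  # no 'digits-' pattern -> '1-Aug-2001'
findDig('5-Mar')     # contains '5-'        -> '5-Mar-01'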
#%%
train_data['earlies_credit_mon'] = pd.to_datetime(train_data['earlies_credit_mon'].map(findDig))
test_public['earlies_credit_mon'] = pd.to_datetime(test_public['earlies_credit_mon'].map(findDig))
#%%
train_data['earlies_credit_mon']
#%%
train_data['earliesCreditMon'] = train_data['earlies_credit_mon'].dt.month
test_public['earliesCreditMon'] = test_public['earlies_credit_mon'].dt.month
train_data['earliesCreditYear'] = train_data['earlies_credit_mon'].dt.year
test_public['earliesCreditYear'] = test_public['earlies_credit_mon'].dt.year
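One optional extra feature (my own assumption, not part of the original baseline) is the length of the credit history, i.e. the time between the earliest credit line and the loan issue date. It must be computed here, while both columns are still datetimes and before they are dropped further down.
#%%
# Optional extra feature (sketch): credit history length in days at issue time
train_data['credit_history_days'] = (train_data['issue_date'] - train_data['earlies_credit_mon']).dt.days
test_public['credit_history_days'] = (test_public['issue_date'] - test_public['earlies_credit_mon']).dt.days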
#%%
train_data.info()
#%%
import gc
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score
from sklearn.model_selection import KFold, StratifiedKFold
from lightgbm import LGBMClassifier
from dateutil.relativedelta import relativedelta
#%%
def train_model(data_, test_, y_, folds_):
    oof_preds = np.zeros(data_.shape[0])
    sub_preds = np.zeros(test_.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in data_.columns if f not in ['loan_id', 'user_id', 'isDefault']]
    for n_fold, (trn_idx, val_idx) in enumerate(folds_.split(data_)):
        trn_x, trn_y = data_[feats].iloc[trn_idx], y_.iloc[trn_idx]
        val_x, val_y = data_[feats].iloc[val_idx], y_.iloc[val_idx]
        clf = LGBMClassifier(
            n_estimators=4000,
            learning_rate=0.08,
            num_leaves=2**5,
            colsample_bytree=.65,
            subsample=.9,
            max_depth=5,
            # max_bin=250,
            reg_alpha=.3,
            reg_lambda=.3,
            min_split_gain=.01,
            min_child_weight=2,
            silent=-1,
            verbose=-1,
        )
        # Note: these fit() arguments target lightgbm 3.x; on lightgbm >= 4 use
        # callbacks=[lgb.early_stopping(40), lgb.log_evaluation(100)] instead.
        clf.fit(trn_x, trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)],
                eval_metric='auc', verbose=100, early_stopping_rounds=40  # 30
                )
        # Out-of-fold predictions on the validation split
        oof_preds[val_idx] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]
        # Average the test predictions over the folds
        sub_preds += clf.predict_proba(test_[feats], num_iteration=clf.best_iteration_)[:, 1] / folds_.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = clf.feature_importances_
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
        del clf, trn_x, trn_y, val_x, val_y
        gc.collect()

    print('Full AUC score %.6f' % roc_auc_score(y_, oof_preds))

    test_['isDefault'] = sub_preds
    return oof_preds, test_[['loan_id', 'isDefault']], feature_importance_df
#%%
y = train_data['isDefault']
folds = KFold(n_splits=5, shuffle=True, random_state=546789)
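Because isDefault is imbalanced, a reasonable alternative (a sketch; train_model would then have to call folds_.split(data_, y_) instead of folds_.split(data_)) is the StratifiedKFold that is already imported:
#%%
# Alternative CV splitter (sketch): stratify the folds on the target
# folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=546789)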
#%%
test_public.info()
#%%
test_public.fillna(method='bfill',inplace=True)
#%%
test_public.info()
#%%
train_data.fillna(method='bfill',inplace=True)
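Backward fill is a quick way to get rid of NaNs, but it fills each gap with whatever value happens to come in the next row. A common alternative (a sketch, not what this baseline uses; it would replace the two bfill cells and should exclude ID/target columns) is to fill numeric columns with the training-set medians:
#%%
# Alternative (sketch): median-fill numeric columns using training statistics
# num_cols = [c for c in train_data.select_dtypes(include='number').columns
#             if c not in ['loan_id', 'user_id', 'isDefault']]
# medians = train_data[num_cols].median()
# train_data[num_cols] = train_data[num_cols].fillna(medians)
# test_public[num_cols] = test_public[num_cols].fillna(medians)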
#%%
train_data.info()
#%%
col_to_drop = ['issue_date', 'earlies_credit_mon']
train_data = train_data.drop(col_to_drop, axis=1)
test_public = test_public.drop(col_to_drop, axis=1 )
#%%
train_data.info()
#%%
test_public.info()
#%%
train_data = train_data.drop('isDefault', axis=1)
#%%
train_data
#%%
test_public
#%%
y
#%%
oof_preds, test_preds, importances = train_model(train_data, test_public, y, folds)
#%%
test_preds.rename({'loan_id': 'id'}, axis=1)[['id', 'isDefault']].to_csv('submit.csv', index=False)
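As a final check (a sketch), the written file should have the same row count and column names as the provided submit_example.csv:
#%%
# Sanity check (sketch): compare the submission with the provided example
submission = pd.read_csv('submit.csv')
print(submission.shape, submit_example.shape)
print(list(submission.columns), list(submit_example.columns))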
#%%
The rest of the code will not be made public.