|
@@ -16,19 +16,21 @@ def f_calcu_model_ks(data, y_column, sort_ascending):
|
|
ascending=sort_ascending)
|
|
ascending=sort_ascending)
|
|
var_ks.columns = ['样本数', '坏样本数']
|
|
var_ks.columns = ['样本数', '坏样本数']
|
|
var_ks['好样本数'] = var_ks['样本数'] - var_ks['坏样本数']
|
|
var_ks['好样本数'] = var_ks['样本数'] - var_ks['坏样本数']
|
|
- var_ks['坏样本比例'] = (var_ks['坏样本数'] / var_ks['样本数']).round(3)
|
|
|
|
- var_ks['样本数比例'] = (var_ks['样本数'] / var_ks['样本数'].sum()).round(3)
|
|
|
|
- var_ks['总坏样本数'] = var_ks['坏样本数'].sum()
|
|
|
|
- var_ks['总好样本数'] = var_ks['好样本数'].sum()
|
|
|
|
- var_ks['平均坏样本率'] = (var_ks['总坏样本数'] / var_ks['样本数'].sum()).round(3)
|
|
|
|
|
|
+ var_ks['样本数占比'] = (var_ks['样本数'] / var_ks['样本数'].sum()).round(3)
|
|
|
|
+ var_ks['坏样本率'] = (var_ks['坏样本数'] / var_ks['样本数']).round(3)
|
|
|
|
+ var_ks['平均坏样本率'] = (var_ks['坏样本数'].sum() / var_ks['样本数'].sum()).round(3)
|
|
|
|
+ var_ks['坏样本占所有坏样本的比例'] = (var_ks['坏样本数'] / var_ks['坏样本数'].sum()).round(3)
|
|
|
|
+ var_ks['好样本占所有好样本的比例'] = (var_ks['好样本数'] / var_ks['好样本数'].sum()).round(3)
|
|
var_ks['累计坏样本数'] = var_ks['坏样本数'].cumsum()
|
|
var_ks['累计坏样本数'] = var_ks['坏样本数'].cumsum()
|
|
var_ks['累计好样本数'] = var_ks['好样本数'].cumsum()
|
|
var_ks['累计好样本数'] = var_ks['好样本数'].cumsum()
|
|
var_ks['累计样本数'] = var_ks['样本数'].cumsum()
|
|
var_ks['累计样本数'] = var_ks['样本数'].cumsum()
|
|
- var_ks['累计坏样本比例'] = (var_ks['累计坏样本数'] / var_ks['总坏样本数']).round(3)
|
|
|
|
- var_ks['累计好样本比例'] = (var_ks['累计好样本数'] / var_ks['总好样本数']).round(3)
|
|
|
|
- var_ks['KS'] = (var_ks['累计坏样本比例'] - var_ks['累计好样本比例']).round(3)
|
|
|
|
|
|
+ var_ks['累计坏样本占比'] = (var_ks['累计坏样本数'] / var_ks['坏样本数'].sum()).round(3)
|
|
|
|
+ var_ks['累计好样本占比'] = (var_ks['累计好样本数'] / var_ks['好样本数'].sum()).round(3)
|
|
|
|
+ var_ks['KS'] = (var_ks['累计坏样本占比'] - var_ks['累计好样本占比']).round(3)
|
|
var_ks['LIFT'] = ((var_ks['累计坏样本数'] / var_ks['累计样本数']) / var_ks['平均坏样本率']).round(3)
|
|
var_ks['LIFT'] = ((var_ks['累计坏样本数'] / var_ks['累计样本数']) / var_ks['平均坏样本率']).round(3)
|
|
- return var_ks.reset_index()
|
|
|
|
|
|
+ var_ks = var_ks.reset_index()
|
|
|
|
+ return var_ks[[ConstantEnum.SCORE_BIN.value, "样本数", "样本数占比", "好样本数", "好样本占所有好样本的比例",
|
|
|
|
+ "累计好样本占比", "坏样本数", "坏样本占所有坏样本的比例", "累计坏样本占比", "坏样本率", "KS", "LIFT"]]
|
|
|
|
|
|
|
|
|
|
def f_get_model_score_bin(df, score, bins=None):
|
|
def f_get_model_score_bin(df, score, bins=None):
|
|
@@ -47,18 +49,18 @@ def f_get_model_score_bin(df, score, bins=None):
|
|
def f_calcu_model_psi(df_train, df_test, sort_ascending=True):
|
|
def f_calcu_model_psi(df_train, df_test, sort_ascending=True):
|
|
tmp1 = df_train.groupby(ConstantEnum.SCORE_BIN.value)[ConstantEnum.SCORE_BIN.value].agg(['count']).sort_index(
|
|
tmp1 = df_train.groupby(ConstantEnum.SCORE_BIN.value)[ConstantEnum.SCORE_BIN.value].agg(['count']).sort_index(
|
|
ascending=sort_ascending)
|
|
ascending=sort_ascending)
|
|
- tmp1['样本数比例'] = (tmp1['count'] / tmp1['count'].sum()).round(3)
|
|
|
|
|
|
+ tmp1['样本数占比'] = (tmp1['count'] / tmp1['count'].sum()).round(3)
|
|
tmp2 = df_test.groupby(ConstantEnum.SCORE_BIN.value)[ConstantEnum.SCORE_BIN.value].agg(['count']).sort_index(
|
|
tmp2 = df_test.groupby(ConstantEnum.SCORE_BIN.value)[ConstantEnum.SCORE_BIN.value].agg(['count']).sort_index(
|
|
ascending=sort_ascending)
|
|
ascending=sort_ascending)
|
|
- tmp2['样本数比例'] = (tmp2['count'] / tmp2['count'].sum()).round(3)
|
|
|
|
- psi = ((tmp1['样本数比例'] - tmp2['样本数比例']) * np.log(tmp1['样本数比例'] / tmp2['样本数比例'])).round(3)
|
|
|
|
|
|
+ tmp2['样本数占比'] = (tmp2['count'] / tmp2['count'].sum()).round(3)
|
|
|
|
+ psi = ((tmp1['样本数占比'] - tmp2['样本数占比']) * np.log(tmp1['样本数占比'] / tmp2['样本数占比'])).round(3)
|
|
psi = psi.reset_index()
|
|
psi = psi.reset_index()
|
|
- psi = psi.rename(columns={"样本数比例": "psi"})
|
|
|
|
|
|
+ psi = psi.rename(columns={"样本数占比": "psi"})
|
|
psi['训练样本数'] = list(tmp1['count'])
|
|
psi['训练样本数'] = list(tmp1['count'])
|
|
- psi['测试样本数'] = list(tmp2['count'])
|
|
|
|
- psi['训练样本数比例'] = list(tmp1['样本数比例'])
|
|
|
|
- psi['测试样本数比例'] = list(tmp2['样本数比例'])
|
|
|
|
- return psi
|
|
|
|
|
|
+ psi['验证样本数'] = list(tmp2['count'])
|
|
|
|
+ psi['训练样本占比'] = list(tmp1['样本数占比'])
|
|
|
|
+ psi['验证样本占比'] = list(tmp2['样本数占比'])
|
|
|
|
+ return psi[[ConstantEnum.SCORE_BIN.value, "训练样本数", "训练样本占比", "验证样本数", "验证样本占比", "psi"]]
|
|
|
|
|
|
|
|
|
|
def f_stress_test(df: pd.DataFrame, sample_times: int, bad_rate_list: list, target_column: str, score_column: str,
|
|
def f_stress_test(df: pd.DataFrame, sample_times: int, bad_rate_list: list, target_column: str, score_column: str,
|