{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "72fcd4bd-020f-4642-8286-3c96a133f1b0", "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "%matplotlib agg\n", "import os\n", "import sys\n", "\n", "# Make the local easy_ml checkout importable. The location can be overridden\n", "# with the EASY_ML_ROOT environment variable; the default is the original path.\n", "PROJECT_ROOT = os.environ.get(\"EASY_ML_ROOT\", \"/root/project\")\n", "if PROJECT_ROOT not in sys.path:  # re-running this cell must not add duplicates\n", "    sys.path.append(PROJECT_ROOT)\n", "\n", "import scorecardpy as sc\n", "\n", "from easy_ml import DataSplitEntity, FilterStrategyFactory, ModelFactory, Pipeline\n" ] }, { "cell_type": "code", "execution_count": null, "id": "263a6e1a-2ac7-4ec5-a100-e0539576e04c", "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [], "source": [ "# Load the demo data returned by scorecardpy's germancredit()\n", "dat = sc.germancredit()\n", "# Replace the dots in the column names with underscores\n", "dat.columns = [col.replace(\".\", \"_\") for col in dat.columns]\n", "# Encode the target column: 1 for bad, 0 for every other value\n", "dat[\"creditability\"] = dat[\"creditability\"].apply(lambda x: 1 if x == \"bad\" else 0)" ] }, { "cell_type": "code", "execution_count": 3, "id": "2ff2a864-438e-4524-b1ed-20003bba498a", "metadata": { "pycharm": { "name": "#%%\n" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 9.35it/s]\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
样本样本数样本占比坏样本数坏样本比例
0训练集70970.90%21129.76%
1测试集29129.10%8930.58%
2合计1000100%30030.00%
\n", "
" ], "text/plain": [ " 样本 样本数 样本占比 坏样本数 坏样本比例\n", "0 训练集 709 70.90% 211 29.76%\n", "1 测试集 291 29.10% 89 30.58%\n", "2 合计 1000 100% 300 30.00%" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
变量IVpsi
0credit_history0.27220.0156
1duration_in_month0.25090.0271
2purpose0.15190.0126
3credit_amount0.14470.0204
4savings_account_and_bonds0.14450.0154
5age_in_years0.07160.0039
\n", "
" ], "text/plain": [ " 变量 IV psi\n", "0 credit_history 0.2722 0.0156\n", "1 duration_in_month 0.2509 0.0271\n", "2 purpose 0.1519 0.0126\n", "3 credit_amount 0.1447 0.0204\n", "4 savings_account_and_bonds 0.1445 0.0154\n", "5 age_in_years 0.0716 0.0039" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
变量vif
0age_in_years_woe1.028369
1purpose_woe1.015325
2credit_history_woe1.030331
3credit_amount_woe1.071660
4duration_in_month_woe1.106351
5savings_account_and_bonds_woe1.018661
\n", "
" ], "text/plain": [ " 变量 vif\n", "0 age_in_years_woe 1.028369\n", "1 purpose_woe 1.015325\n", "2 credit_history_woe 1.030331\n", "3 credit_amount_woe 1.071660\n", "4 duration_in_month_woe 1.106351\n", "5 savings_account_and_bonds_woe 1.018661" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "

变量趋势训练集

变量趋势测试集

" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "变量切分点:\n", "{\n", " \"credit_history\": [\n", " \"no credits taken/ all credits paid back duly%,%all credits at this bank paid back duly\",\n", " \"existing credits paid back duly till now\",\n", " \"delay in paying off in the past\",\n", " \"critical account/ other credits existing (not at this bank)\"\n", " ],\n", " \"savings_account_and_bonds\": [\n", " \"... < 100 DM%,%100 <= ... < 500 DM\",\n", " \"500 <= ... < 1000 DM%,%... >= 1000 DM\",\n", " \"unknown/ no savings account\"\n", " ],\n", " \"duration_in_month\": [\n", " 12,\n", " 18,\n", " 48\n", " ],\n", " \"purpose\": [\n", " \"retraining%,%car (used)\",\n", " \"radio/television\",\n", " \"furniture/equipment%,%business%,%repairs\",\n", " \"domestic appliances%,%education%,%car (new)%,%others\"\n", " ],\n", " \"credit_amount\": [\n", " 2000,\n", " 3500,\n", " 4000,\n", " 7000\n", " ],\n", " \"age_in_years\": [\n", " 27,\n", " 34,\n", " 58\n", " ]\n", "}\n", "-----模型结果-----\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
样本集AUCKS
0训练集0.76170.4182
1测试集0.78500.4977
\n", "
" ], "text/plain": [ " 样本集 AUC KS\n", "0 训练集 0.7617 0.4182\n", "1 测试集 0.7850 0.4977" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MODEL_SCORE_BINpsi训练样本数测试样本数训练样本数比例测试样本数比例
0(-inf, 521.0]0.000072290.10160.0997
1(521.0, 552.0]0.000071290.10010.0997
2(552.0, 578.0]0.000072300.10160.1031
3(578.0, 596.0]0.002883290.11710.0997
4(596.0, 611.0]0.004857180.08040.0619
5(611.0, 630.0]0.004979260.11140.0893
6(630.0, 649.0]0.000066270.09310.0928
7(649.0, 675.4]0.016167400.09450.1375
8(675.4, 706.4]0.000071290.10010.0997
9(706.4, inf]0.002671340.10010.1168
\n", "
" ], "text/plain": [ " MODEL_SCORE_BIN psi 训练样本数 测试样本数 训练样本数比例 测试样本数比例\n", "0 (-inf, 521.0] 0.0000 72 29 0.1016 0.0997\n", "1 (521.0, 552.0] 0.0000 71 29 0.1001 0.0997\n", "2 (552.0, 578.0] 0.0000 72 30 0.1016 0.1031\n", "3 (578.0, 596.0] 0.0028 83 29 0.1171 0.0997\n", "4 (596.0, 611.0] 0.0048 57 18 0.0804 0.0619\n", "5 (611.0, 630.0] 0.0049 79 26 0.1114 0.0893\n", "6 (630.0, 649.0] 0.0000 66 27 0.0931 0.0928\n", "7 (649.0, 675.4] 0.0161 67 40 0.0945 0.1375\n", "8 (675.4, 706.4] 0.0000 71 29 0.1001 0.0997\n", "9 (706.4, inf] 0.0026 71 34 0.1001 0.1168" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "模型psi: 0.0312\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
变量变量系数
0age_in_years_woe0.739076
1purpose_woe1.161895
2credit_history_woe0.896308
3credit_amount_woe0.742090
4duration_in_month_woe0.811411
5savings_account_and_bonds_woe0.910304
\n", "
" ], "text/plain": [ " 变量 变量系数\n", "0 age_in_years_woe 0.739076\n", "1 purpose_woe 1.161895\n", "2 credit_history_woe 0.896308\n", "3 credit_amount_woe 0.742090\n", "4 duration_in_month_woe 0.811411\n", "5 savings_account_and_bonds_woe 0.910304" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "-----训练集-分数分箱-----\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MODEL_SCORE_BIN样本数坏样本数好样本数坏样本比例样本数比例总坏样本数总好样本数平均坏样本率累计坏样本数累计好样本数累计样本数累计坏样本比例累计好样本比例KSLIFT
0(-inf, 521.0]7245270.62500.10162114980.29764527720.21330.05420.15912.1001
1(521.0, 552.0]7134370.47890.10012114980.297679641430.37440.12850.24591.8563
2(552.0, 578.0]7235370.48610.10162114980.29761141012150.54030.20280.33751.7817
3(578.0, 596.0]8332510.38550.11712114980.29761461522980.69190.30520.38671.6463
4(596.0, 611.0]5717400.29820.08042114980.29761631923550.77250.38550.38701.5429
5(611.0, 630.0]7923560.29110.11142114980.29761862484340.88150.49800.38351.4401
6(630.0, 649.0]668580.12120.09312114980.29761943065000.91940.61450.30491.3038
7(649.0, 675.4]6710570.14930.09452114980.29762043635670.96680.72890.23791.2090
8(675.4, 706.4]715660.07040.10012114980.29762094296380.99050.86140.12911.1008
9(706.4, inf]712690.02820.10012114980.29762114987091.00001.00000.00001.0000
\n", "
" ], "text/plain": [ " MODEL_SCORE_BIN 样本数 坏样本数 好样本数 坏样本比例 样本数比例 总坏样本数 总好样本数 平均坏样本率 \\\n", "0 (-inf, 521.0] 72 45 27 0.6250 0.1016 211 498 0.2976 \n", "1 (521.0, 552.0] 71 34 37 0.4789 0.1001 211 498 0.2976 \n", "2 (552.0, 578.0] 72 35 37 0.4861 0.1016 211 498 0.2976 \n", "3 (578.0, 596.0] 83 32 51 0.3855 0.1171 211 498 0.2976 \n", "4 (596.0, 611.0] 57 17 40 0.2982 0.0804 211 498 0.2976 \n", "5 (611.0, 630.0] 79 23 56 0.2911 0.1114 211 498 0.2976 \n", "6 (630.0, 649.0] 66 8 58 0.1212 0.0931 211 498 0.2976 \n", "7 (649.0, 675.4] 67 10 57 0.1493 0.0945 211 498 0.2976 \n", "8 (675.4, 706.4] 71 5 66 0.0704 0.1001 211 498 0.2976 \n", "9 (706.4, inf] 71 2 69 0.0282 0.1001 211 498 0.2976 \n", "\n", " 累计坏样本数 累计好样本数 累计样本数 累计坏样本比例 累计好样本比例 KS LIFT \n", "0 45 27 72 0.2133 0.0542 0.1591 2.1001 \n", "1 79 64 143 0.3744 0.1285 0.2459 1.8563 \n", "2 114 101 215 0.5403 0.2028 0.3375 1.7817 \n", "3 146 152 298 0.6919 0.3052 0.3867 1.6463 \n", "4 163 192 355 0.7725 0.3855 0.3870 1.5429 \n", "5 186 248 434 0.8815 0.4980 0.3835 1.4401 \n", "6 194 306 500 0.9194 0.6145 0.3049 1.3038 \n", "7 204 363 567 0.9668 0.7289 0.2379 1.2090 \n", "8 209 429 638 0.9905 0.8614 0.1291 1.1008 \n", "9 211 498 709 1.0000 1.0000 0.0000 1.0000 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "-----测试集-分数分箱-----\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
MODEL_SCORE_BIN样本数坏样本数好样本数坏样本比例样本数比例总坏样本数总好样本数平均坏样本率累计坏样本数累计好样本数累计样本数累计坏样本比例累计好样本比例KSLIFT
0(-inf, 521.0]292180.72410.0997892020.3058218290.23600.03960.19642.3680
1(521.0, 552.0]2916130.55170.0997892020.30583721580.41570.10400.31172.0861
2(552.0, 578.0]3014160.46670.1031892020.30585137880.57300.18320.38981.8952
3(578.0, 596.0]2914150.48280.0997892020.305865521170.73030.25740.47291.8167
4(596.0, 611.0]184140.22220.0619892020.305869661350.77530.32670.44861.6714
5(611.0, 630.0]267190.26920.0893892020.305876851610.85390.42080.43311.5437
6(630.0, 649.0]273240.11110.0928892020.3058791091880.88760.53960.34801.3741
7(649.0, 675.4]406340.15000.1375892020.3058851432280.95510.70790.24721.2191
8(675.4, 706.4]292270.06900.0997892020.3058871702570.97750.84160.13591.1070
9(706.4, inf]342320.05880.1168892020.3058892022911.00001.00000.00001.0001
\n", "
" ], "text/plain": [ " MODEL_SCORE_BIN 样本数 坏样本数 好样本数 坏样本比例 样本数比例 总坏样本数 总好样本数 平均坏样本率 \\\n", "0 (-inf, 521.0] 29 21 8 0.7241 0.0997 89 202 0.3058 \n", "1 (521.0, 552.0] 29 16 13 0.5517 0.0997 89 202 0.3058 \n", "2 (552.0, 578.0] 30 14 16 0.4667 0.1031 89 202 0.3058 \n", "3 (578.0, 596.0] 29 14 15 0.4828 0.0997 89 202 0.3058 \n", "4 (596.0, 611.0] 18 4 14 0.2222 0.0619 89 202 0.3058 \n", "5 (611.0, 630.0] 26 7 19 0.2692 0.0893 89 202 0.3058 \n", "6 (630.0, 649.0] 27 3 24 0.1111 0.0928 89 202 0.3058 \n", "7 (649.0, 675.4] 40 6 34 0.1500 0.1375 89 202 0.3058 \n", "8 (675.4, 706.4] 29 2 27 0.0690 0.0997 89 202 0.3058 \n", "9 (706.4, inf] 34 2 32 0.0588 0.1168 89 202 0.3058 \n", "\n", " 累计坏样本数 累计好样本数 累计样本数 累计坏样本比例 累计好样本比例 KS LIFT \n", "0 21 8 29 0.2360 0.0396 0.1964 2.3680 \n", "1 37 21 58 0.4157 0.1040 0.3117 2.0861 \n", "2 51 37 88 0.5730 0.1832 0.3898 1.8952 \n", "3 65 52 117 0.7303 0.2574 0.4729 1.8167 \n", "4 69 66 135 0.7753 0.3267 0.4486 1.6714 \n", "5 76 85 161 0.8539 0.4208 0.4331 1.5437 \n", "6 79 109 188 0.8876 0.5396 0.3480 1.3741 \n", "7 85 143 228 0.9551 0.7079 0.2472 1.2191 \n", "8 87 170 257 0.9775 0.8416 0.1359 1.1070 \n", "9 89 202 291 1.0000 1.0000 0.0000 1.0001 " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
variablebinpoints
0basepointsNaN599.0
0age_in_years[-inf,27.0)-16.0
1age_in_years[27.0,34.0)-8.0
2age_in_years[34.0,58.0)17.0
3age_in_years[58.0,inf)-9.0
4purposeretraining%,%car (used)73.0
5purposeradio/television31.0
6purposefurniture/equipment%,%business%,%repairs-12.0
7purposedomestic appliances%,%education%,%car (new)%,%...-32.0
8credit_historyno credits taken/ all credits paid back duly%,...-84.0
9credit_historyexisting credits paid back duly till now-3.0
10credit_historydelay in paying off in the past-9.0
11credit_historycritical account/ other credits existing (not ...43.0
12credit_amount[-inf,2000.0)5.0
13credit_amount[2000.0,3500.0)14.0
14credit_amount[3500.0,4000.0)50.0
15credit_amount[4000.0,7000.0)-21.0
16credit_amount[7000.0,inf)-40.0
17duration_in_month[-inf,12.0)58.0
18duration_in_month[12.0,18.0)12.0
19duration_in_month[18.0,48.0)-15.0
20duration_in_month[48.0,inf)-53.0
21savings_account_and_bonds... < 100 DM%,%100 <= ... < 500 DM-14.0
22savings_account_and_bonds500 <= ... < 1000 DM%,%... >= 1000 DM62.0
23savings_account_and_bondsunknown/ no savings account31.0
\n", "
" ], "text/plain": [ " variable \\\n", "0 basepoints \n", "0 age_in_years \n", "1 age_in_years \n", "2 age_in_years \n", "3 age_in_years \n", "4 purpose \n", "5 purpose \n", "6 purpose \n", "7 purpose \n", "8 credit_history \n", "9 credit_history \n", "10 credit_history \n", "11 credit_history \n", "12 credit_amount \n", "13 credit_amount \n", "14 credit_amount \n", "15 credit_amount \n", "16 credit_amount \n", "17 duration_in_month \n", "18 duration_in_month \n", "19 duration_in_month \n", "20 duration_in_month \n", "21 savings_account_and_bonds \n", "22 savings_account_and_bonds \n", "23 savings_account_and_bonds \n", "\n", " bin points \n", "0 NaN 599.0 \n", "0 [-inf,27.0) -16.0 \n", "1 [27.0,34.0) -8.0 \n", "2 [34.0,58.0) 17.0 \n", "3 [58.0,inf) -9.0 \n", "4 retraining%,%car (used) 73.0 \n", "5 radio/television 31.0 \n", "6 furniture/equipment%,%business%,%repairs -12.0 \n", "7 domestic appliances%,%education%,%car (new)%,%... -32.0 \n", "8 no credits taken/ all credits paid back duly%,... -84.0 \n", "9 existing credits paid back duly till now -3.0 \n", "10 delay in paying off in the past -9.0 \n", "11 critical account/ other credits existing (not ... 43.0 \n", "12 [-inf,2000.0) 5.0 \n", "13 [2000.0,3500.0) 14.0 \n", "14 [3500.0,4000.0) 50.0 \n", "15 [4000.0,7000.0) -21.0 \n", "16 [7000.0,inf) -40.0 \n", "17 [-inf,12.0) 58.0 \n", "18 [12.0,18.0) 12.0 \n", "19 [18.0,48.0) -15.0 \n", "20 [48.0,inf) -53.0 \n", "21 ... < 100 DM%,%100 <= ... < 500 DM -14.0 \n", "22 500 <= ... < 1000 DM%,%... 
>= 1000 DM 62.0 \n", "23 unknown/ no savings account 31.0 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "模型报告文件储存路径:./cache/train/2025_01_03_14_38_35/模型报告.docx\n" ] } ], "source": [ "# Positional split of the demo frame: the first TRAIN_SIZE rows train, the rest test\n", "TRAIN_SIZE = 709\n", "data = DataSplitEntity(train_data=dat.iloc[:TRAIN_SIZE], test_data=dat.iloc[TRAIN_SIZE:])\n", "\n", "# Manually defined split points; for string variables, enumeration values are merged with '%,%'\n", "breaks_list = {\n", "    'duration_in_month': [12, 18, 48],\n", "    'credit_amount': [2000, 3500, 4000, 7000],\n", "    'purpose': [\n", "        'retraining%,%car (used)',\n", "        'radio/television',\n", "        'furniture/equipment%,%business%,%repairs',\n", "        'domestic appliances%,%education%,%car (new)%,%others',\n", "    ],\n", "    'age_in_years': [27, 34, 58],\n", "}\n", "\n", "# Candidate variables\n", "x_columns = [\n", "    \"duration_in_month\",\n", "    \"credit_amount\",\n", "    \"age_in_years\",\n", "    \"purpose\",\n", "    \"credit_history\",\n", "    \"savings_account_and_bonds\",\n", "]\n", "\n", "# Feature processing configuration\n", "cfg = {\n", "    # Print output when running under Jupyter\n", "    \"jupyter_print\": True,\n", "    # Whether to enable coarse binning\n", "    \"format_bin\": False,\n", "    # Sampling rate for the variable split-point search\n", "    \"sample_rate\": 0.1,\n", "    # Maximum number of candidate variables\n", "    \"max_feature_num\": 10,\n", "    # Number of monotonicity changes allowed\n", "    \"monto_shift_threshold\": 1,\n", "    # Special values\n", "    # \"special_values\": {\"age_in_years\": [36]},\n", "    \"breaks_list\": breaks_list,\n", "    # y\n", "    \"y_column\": \"creditability\",\n", "    \"x_columns\": x_columns,\n", "}\n", "\n", "# Choose the feature filtering strategy\n", "feature_strategy = FilterStrategyFactory.get_strategy(\"iv\")(**cfg)\n", "\n", "# Choose the model\n", "model = ModelFactory.get_model(\"lr\")()\n", "\n", "# Train and generate the report\n", "train_pipeline = Pipeline(feature_strategy, model, data)\n", "train_pipeline.train()\n", "train_pipeline.report()" ] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:easy_ml]", "language": "python", "name": "conda-env-easy_ml-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.15" } }, 
"nbformat": 4, "nbformat_minor": 5 }