|
15 | 15 | FACTOR_UNKNOWN = 'UNKNOWN' |
16 | 16 |
|
17 | 17 |
|
| 18 | +def _build_numeric_expression(split_points, scores, nan_score=None): |
| 19 | + """Build a nested if-else expression for ExpressionTransformer. |
| 20 | +
|
| 21 | + Args: |
| 22 | + split_points (ndarray): split point values |
| 23 | + scores (ndarray): scores array, length = len(split_points) + 1 |
| 24 | + nan_score (float|None): score for NaN values |
| 25 | +
|
| 26 | + Returns: |
| 27 | + str: expression string for ExpressionTransformer |
| 28 | + """ |
| 29 | + n_splits = len(split_points) |
| 30 | + |
| 31 | + if n_splits == 0: |
| 32 | + s = str(float(scores[0])) |
| 33 | + if nan_score is not None: |
| 34 | + return f'{nan_score} if pandas.isnull(X[0]) else {s}' |
| 35 | + return s |
| 36 | + |
| 37 | + parts = [] |
| 38 | + closing = '' |
| 39 | + |
| 40 | + if nan_score is not None: |
| 41 | + parts.append(f'{nan_score} if pandas.isnull(X[0])') |
| 42 | + |
| 43 | + for i in range(n_splits + 1): |
| 44 | + score = float(scores[i]) |
| 45 | + if i == 0: |
| 46 | + if parts: |
| 47 | + parts.append(f' else ({score} if X[0] < {split_points[i]}') |
| 48 | + closing += ')' |
| 49 | + else: |
| 50 | + parts.append(f'{score} if X[0] < {split_points[i]}') |
| 51 | + elif i == n_splits: |
| 52 | + parts.append(f' else {score}') |
| 53 | + else: |
| 54 | + parts.append(f' else ({score} if X[0] < {split_points[i]}') |
| 55 | + closing += ')' |
| 56 | + |
| 57 | + parts.append(closing) |
| 58 | + return ''.join(parts) |
| 59 | + |
| 60 | + |
18 | 61 |
|
19 | 62 | class ScoreCard(BaseEstimator, RulesMixin, BinsMixin): |
20 | 63 | def __init__(self, pdo = 60, rate = 2, base_odds = 35, base_score = 750, |
@@ -377,6 +420,88 @@ def after_export(self, card, to_frame = False, to_json = None, to_csv = None, ** |
377 | 420 | return card |
378 | 421 |
|
379 | 422 |
|
def card2pmml(self, pmml_path='scorecard.pmml', debug=False):
    """Write the scorecard rules to a PMML file.

    Each feature in ``self.rules`` becomes one column transformer that maps
    the raw value to its bin score: a ``LookupTransformer`` for
    non-numeric bins and an ``ExpressionTransformer`` for numeric split
    points. The per-feature scores are then summed by a
    ``LinearRegression`` whose coefficients are all ones and whose
    intercept is zero.

    Args:
        pmml_path (str): path to write the PMML file
        debug (bool): if True, print debug info from sklearn2pmml

    Raises:
        ImportError: if the optional PMML dependencies are not installed
        RuntimeError: if the scorecard has no rules

    Requires:
        pip install toad[pmml] (sklearn2pmml >= 0.80, sklearn-pandas >= 2.0)
        Java 11+ runtime
    """
    # Optional dependencies are imported lazily so the rest of the module
    # keeps working without them.
    try:
        from sklearn_pandas import DataFrameMapper
        from sklearn.linear_model import LinearRegression
        from sklearn2pmml import sklearn2pmml, PMMLPipeline
        from sklearn2pmml.preprocessing import LookupTransformer, ExpressionTransformer
    except ImportError as e:
        raise ImportError(
            "card2pmml requires 'sklearn2pmml' and 'sklearn-pandas'. "
            "Install them with: pip install toad[pmml]"
        ) from e

    if not self.rules:
        raise RuntimeError(
            "No scorecard rules found. Call fit() or load() before card2pmml()."
        )

    column_steps = []
    for name, rule in self.rules.items():
        bins, scores = rule['bins'], rule['scores']

        if np.issubdtype(bins.dtype, np.number):
            # Numeric feature: a trailing NaN split point marks a
            # dedicated NaN bin whose score is the last entry of `scores`.
            nan_score = None
            if len(bins) > 0 and np.isnan(bins[-1]):
                nan_score = float(scores[-1])
                bins, scores = bins[:-1], scores[:-1]

            transformer = ExpressionTransformer(
                _build_numeric_expression(bins, scores, nan_score),
            )
        else:
            # Categorical feature: flatten every group into a
            # value -> score table; the else-group supplies the default.
            lookup = {}
            fallback = 0.0
            for group, score in zip(bins, scores):
                value = float(score)
                if isinstance(group, str) and group == self.ELSE_GROUP:
                    fallback = value
                elif isinstance(group, (list, np.ndarray)):
                    lookup.update(dict.fromkeys(group, value))
                else:
                    lookup[group] = value

            transformer = LookupTransformer(
                mapping=lookup, default_value=fallback,
            )

        column_steps.append(([name], transformer))

    # The regression is never fitted: its attributes are set by hand so
    # that predicting is a plain sum of the per-feature scores.
    names = list(self.rules.keys())
    summer = LinearRegression(fit_intercept=False)
    summer.n_features_in_ = len(names)
    summer.feature_names_in_ = np.array(names)
    summer.coef_ = np.ones(len(names))
    summer.intercept_ = 0.0

    pipeline = PMMLPipeline([
        ('preprocessing', DataFrameMapper(column_steps, df_out=True)),
        ('scorecard', summer),
    ])
    sklearn2pmml(pipeline, pmml_path, with_repr=True, debug=debug)
| 504 | + |
380 | 505 |
|
381 | 506 | def _generate_testing_frame(self, maps, size = 'max', mishap = True, gap = 1e-2): |
382 | 507 | """ |
|
0 commit comments