=================
Usage: Classifier
=================

.. code:: python

    from cosmic_toolbox import arraytools as at, file_utils
    import numpy as np
    from scipy.special import expit
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split

    from edelweiss.classifier import (
        Classifier,
        MultiClassifier,
        MultiClassClassifier,
        load_classifier,
        load_multiclassifier,
    )

    np.random.seed(1996)

Classifier
==========

.. code:: python

    # Load the dataset and split it into training and test sets
    data = load_breast_cancer()
    X = at.arr2rec(data.data, names=data["feature_names"])
    y = data.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

.. code:: python

    # Select the scaling method and the classifier, whether to use
    # cross-validation, and whether the classifier should be calibrated
    clf = Classifier(scaler="robust", clf="XGB", cv=3, cv_scoring="f1", calibrate=True)
    clf.train(X_train, y_train)
    clf.test(X_test, y_test)

.. parsed-literal::

    24-07-25 14:34:16 classifier INF Training this model:
    24-07-25 14:34:16 classifier INF CalibratedClassifierCV
    24-07-25 14:34:16 classifier INF scaler:
    24-07-25 14:34:16 classifier INF RobustScaler()
    24-07-25 14:34:16 classifier INF clf:
    24-07-25 14:34:16 classifier INF XGBClassifier(base_score=None, booster=None, callbacks=None,
                  colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None,
                  device=None, early_stopping_rounds=None, enable_categorical=False,
                  eval_metric=None, feature_types=None, gamma=None, grow_policy=None,
                  importance_type=None, interaction_constraints=None, learning_rate=None,
                  max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
                  max_delta_step=None, max_depth=None, max_leaves=None,
                  min_child_weight=None, missing=nan, monotone_constraints=None,
                  multi_strategy=None, n_estimators=None, n_jobs=None,
                  num_parallel_tree=None, random_state=None, ...)
    24-07-25 14:34:16 classifier INF number of samples: 398
    24-07-25 14:34:16 classifier INF -------------------
    24-07-25 14:34:16 classifier INF Start cross validation
    24-07-25 14:34:16 classifier INF Running the Grid search on 1 jobs
    24-07-25 14:34:31 classifier INF Best parameters found by grid search: {'clf__learning_rate': 0.5, 'clf__max_depth': 3, 'clf__n_estimators': 10}
    24-07-25 14:34:31 classifier INF Training completed
    24-07-25 14:34:31 clf_diagno INF Test scores:
    24-07-25 14:34:31 clf_diagno INF ------------
    24-07-25 14:34:31 clf_diagno INF Accuracy: 0.9649122807017544
    24-07-25 14:34:31 clf_diagno INF Precision: 0.9636363636363636
    24-07-25 14:34:31 clf_diagno INF Recall: 0.9814814814814815
    24-07-25 14:34:31 clf_diagno INF F1 score: 0.9724770642201835
    24-07-25 14:34:31 clf_diagno INF Number of positives: 110 / 108
    24-07-25 14:34:31 clf_diagno INF ROC AUC score: 0.9944885361552028
    24-07-25 14:34:31 clf_diagno INF Log loss score: 0.09747317087777803
    24-07-25 14:34:31 clf_diagno INF Brier score: 0.028412256825347895
    24-07-25 14:34:31 clf_diagno INF AUC-PR score: 0.9965684957065074
    24-07-25 14:34:31 clf_diagno INF ------------

.. code:: python

    # There are three predict options.

    # predict (default):
    # predicts 1/0 by sampling against the classifier's probability;
    # a sample with prob=0.2 will be predicted as 1 in 20% of the cases
    y_pred = clf.predict(X_test)  # equivalent to clf(X_test)

    # non_proba:
    # predicts 1/0 deterministically if prob > 0.5;
    # a sample with prob=0.2 will never be predicted as 1
    y_pred_non_proba = clf.predict_non_proba(X_test)

    # proba:
    # predicts the probability itself;
    # a sample with prob=0.2 will have output 0.2
    y_prob = clf.predict_proba(X_test)
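Conceptually, the default ``predict`` behaves like sampling against the
returned probabilities. The following is a minimal sketch of that idea
only (not the edelweiss internals), assuming ``predict_proba`` returns a
1D array of positive-class probabilities:

.. code:: python

    # Sketch: compare each probability to a uniform draw, so a sample
    # with prob=0.2 comes out positive in roughly 20% of draws
    probs = clf.predict_proba(X_test)
    y_sampled = probs > np.random.uniform(0, 1, size=len(probs))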
.. code:: python

    # The classifier can easily be saved and loaded
    clf.save("clf")
    clf = load_classifier("clf")
    file_utils.robust_remove("clf")

.. parsed-literal::

    24-07-25 14:34:31 file_utils INF Created directory clf/clf
    24-07-25 14:34:31 classifier INF Classifier saved to clf/clf

MultiClassifier
===============

If you have label-encoded features (e.g. star/red galaxy/blue galaxy),
you can either use the standard classifier with them as input, or you
can split your dataset and train a separate classifier for each of
these categories. For the latter, you can use the ``MultiClassifier``
class; a conceptual sketch of the split follows after the training
output below.

.. code:: python

    # Generate such a dataset
    n_samples = 10000
    X = dict()
    X["galaxy_type"] = np.random.randint(-1, 2, n_samples)  # -1: star, 0: blue, 1: red
    X["mag"] = np.random.uniform(20, 27, n_samples)
    X["size"] = np.random.uniform(2, 5, n_samples)
    X = at.dict2rec(X)


    def detection_probability(mag, is_star):
        # Sigmoid function centered at mag=24
        base_prob = expit(-(mag - 24) * 2)
        # Suppress the detection probability for stars
        return np.where(is_star, base_prob * 0.5, base_prob)


    prob = detection_probability(X["mag"], X["galaxy_type"] == -1)
    y = prob > np.random.uniform(0, 1, len(prob))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

.. code:: python

    # The MultiClassifier can be used the same way as the normal classifier
    clf = MultiClassifier(split_label="galaxy_type", labels=[-1, 0, 1])
    clf.train(X_train, y_train)
    clf.test(X_test, y_test)

.. parsed-literal::

    24-07-25 14:34:31 classifier INF Training this model:
    24-07-25 14:34:31 classifier INF CalibratedClassifierCV
    24-07-25 14:34:31 classifier INF scaler:
    24-07-25 14:34:31 classifier INF StandardScaler()
    24-07-25 14:34:31 classifier INF clf:
    24-07-25 14:34:31 classifier INF XGBClassifier(base_score=None, booster=None, callbacks=None,
                  colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None,
                  device=None, early_stopping_rounds=None, enable_categorical=False,
                  eval_metric=None, feature_types=None, gamma=None, grow_policy=None,
                  importance_type=None, interaction_constraints=None, learning_rate=None,
                  max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
                  max_delta_step=None, max_depth=None, max_leaves=None,
                  min_child_weight=None, missing=nan, monotone_constraints=None,
                  multi_strategy=None, n_estimators=None, n_jobs=None,
                  num_parallel_tree=None, random_state=None, ...)
    24-07-25 14:34:31 classifier INF number of samples: 2341
    24-07-25 14:34:31 classifier INF -------------------
    24-07-25 14:34:31 classifier INF Training completed
    24-07-25 14:34:31 classifier INF Training this model:
    24-07-25 14:34:31 classifier INF CalibratedClassifierCV
    24-07-25 14:34:31 classifier INF scaler:
    24-07-25 14:34:31 classifier INF StandardScaler()
    24-07-25 14:34:31 classifier INF clf:
    24-07-25 14:34:31 classifier INF XGBClassifier(base_score=None, booster=None, callbacks=None,
                  colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None,
                  device=None, early_stopping_rounds=None, enable_categorical=False,
                  eval_metric=None, feature_types=None, gamma=None, grow_policy=None,
                  importance_type=None, interaction_constraints=None, learning_rate=None,
                  max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
                  max_delta_step=None, max_depth=None, max_leaves=None,
                  min_child_weight=None, missing=nan, monotone_constraints=None,
                  multi_strategy=None, n_estimators=None, n_jobs=None,
                  num_parallel_tree=None, random_state=None, ...)
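The training output above shows one fit per category, which matches the
idea of the split: one subset of the data, and hence one classifier, per
value of ``split_label``. The following is a conceptual sketch of that
split (illustration only, not the edelweiss internals):

.. code:: python

    # Sketch: count the training samples that each per-label
    # classifier would see
    for label in (-1, 0, 1):
        mask = X_train["galaxy_type"] == label
        print(f"galaxy_type={label}: {mask.sum()} training samples")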
    24-07-25 14:34:31 classifier INF number of samples: 2288
    24-07-25 14:34:31 classifier INF -------------------
    24-07-25 14:34:32 classifier INF Training completed
    24-07-25 14:34:32 classifier INF Training this model:
    24-07-25 14:34:32 classifier INF CalibratedClassifierCV
    24-07-25 14:34:32 classifier INF scaler:
    24-07-25 14:34:32 classifier INF StandardScaler()
    24-07-25 14:34:32 classifier INF clf:
    24-07-25 14:34:32 classifier INF XGBClassifier(base_score=None, booster=None, callbacks=None,
                  colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None,
                  device=None, early_stopping_rounds=None, enable_categorical=False,
                  eval_metric=None, feature_types=None, gamma=None, grow_policy=None,
                  importance_type=None, interaction_constraints=None, learning_rate=None,
                  max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
                  max_delta_step=None, max_depth=None, max_leaves=None,
                  min_child_weight=None, missing=nan, monotone_constraints=None,
                  multi_strategy=None, n_estimators=None, n_jobs=None,
                  num_parallel_tree=None, random_state=None, ...)
    24-07-25 14:34:32 classifier INF number of samples: 2371
    24-07-25 14:34:32 classifier INF -------------------
    24-07-25 14:34:32 classifier INF Training completed
    24-07-25 14:34:32 clf_diagno INF Test scores:
    24-07-25 14:34:32 clf_diagno INF ------------
    24-07-25 14:34:32 clf_diagno INF Accuracy: 0.7823333333333333
    24-07-25 14:34:32 clf_diagno INF Precision: 0.7760381211708646
    24-07-25 14:34:32 clf_diagno INF Recall: 0.7786885245901639
    24-07-25 14:34:32 clf_diagno INF F1 score: 0.7773610637572451
    24-07-25 14:34:32 clf_diagno INF Number of positives: 1469 / 1464
    24-07-25 14:34:32 clf_diagno INF ROC AUC score: 0.9315192217383879
    24-07-25 14:34:32 clf_diagno INF Log loss score: 0.3260016111110836
    24-07-25 14:34:32 clf_diagno INF Brier score: 0.10553284672289355
    24-07-25 14:34:32 clf_diagno INF AUC-PR score: 0.9335087911620549
    24-07-25 14:34:32 clf_diagno INF ------------

.. code:: python

    # Prediction, saving, and loading work the same way as before
    y_pred = clf.predict(X_test)
    y_pred_non_proba = clf.predict_non_proba(X_test)
    y_prob = clf.predict_proba(X_test)

    clf.save("multi_clf")
    clf = load_multiclassifier("multi_clf")
    file_utils.robust_remove("multi_clf")

.. parsed-literal::

    24-07-25 14:34:32 file_utils INF Created directory multi_clf/clf
    24-07-25 14:34:32 file_utils INF Created directory multi_clf/clf_-1
    24-07-25 14:34:32 classifier INF Classifier saved to multi_clf/clf_-1
    24-07-25 14:34:32 file_utils INF Created directory multi_clf/clf_0
    24-07-25 14:34:32 classifier INF Classifier saved to multi_clf/clf_0
    24-07-25 14:34:32 file_utils INF Created directory multi_clf/clf_1
    24-07-25 14:34:32 classifier INF Classifier saved to multi_clf/clf_1
    24-07-25 14:34:32 classifier INF MultiClassifier saved to multi_clf/clf
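A quick consistency check after a save/load round trip can be
reassuring. The sketch below uses only the calls shown above; the
directory name ``multi_clf_check`` is hypothetical, and
``predict_non_proba`` is used because the default ``predict`` samples
from probabilities and is therefore not deterministic:

.. code:: python

    # Hypothetical round-trip check: the deterministic predictor should
    # give identical results before and after saving/loading
    clf.save("multi_clf_check")
    clf_loaded = load_multiclassifier("multi_clf_check")
    assert np.array_equal(
        clf.predict_non_proba(X_test), clf_loaded.predict_non_proba(X_test)
    )
    file_utils.robust_remove("multi_clf_check")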
Multiclass Classifier
=====================

.. code:: python

    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=10000, n_informative=4, n_classes=3)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    clf = MultiClassClassifier()
    clf.train(X_train, y_train)
    clf.test(X_test, y_test)

.. parsed-literal::

    24-09-19 17:05:01 classifier INF Training this model:
    24-09-19 17:05:01 classifier INF CalibratedClassifierCV
    24-09-19 17:05:01 classifier INF scaler:
    24-09-19 17:05:01 classifier INF StandardScaler()
    24-09-19 17:05:01 classifier INF clf:
    24-09-19 17:05:01 classifier INF XGBClassifier(base_score=None, booster=None, callbacks=None,
                  colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None,
                  device=None, early_stopping_rounds=None, enable_categorical=False,
                  eval_metric=None, feature_types=None, gamma=None, grow_policy=None,
                  importance_type=None, interaction_constraints=None, learning_rate=None,
                  max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
                  max_delta_step=None, max_depth=None, max_leaves=None,
                  min_child_weight=None, missing=nan, monotone_constraints=None,
                  multi_strategy=None, n_estimators=None, n_jobs=None,
                  num_parallel_tree=None, random_state=None, ...)
    24-09-19 17:05:01 classifier INF number of samples: 7000
    24-09-19 17:05:01 classifier INF -------------------
    24-09-19 17:05:03 classifier INF Training completed
    24-09-19 17:05:03 clf_diagno INF Test scores:
    24-09-19 17:05:03 clf_diagno INF ------------
    24-09-19 17:05:03 clf_diagno INF Accuracy: 0.8253333333333334
    24-09-19 17:05:03 clf_diagno INF Precision: 0.8254668181132855
    24-09-19 17:05:03 clf_diagno INF Recall: 0.8253333333333334
    24-09-19 17:05:03 clf_diagno INF F1 score: 0.825280602771478
    24-09-19 17:05:03 clf_diagno INF ------------

.. code:: python

    y_pred = clf.predict(X_test)
    y_pred_non_proba = clf.predict_non_proba(X_test)
    y_prob = clf.predict_proba(X_test)

    clf.save("clf")
    clf = load_classifier("clf")
    file_utils.robust_remove("clf")

.. parsed-literal::

    24-09-19 17:05:03 file_utils INF Created directory clf/clf
    24-09-19 17:05:03 classifier INF Classifier saved to clf/clf
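For the multiclass case, ``predict_proba`` presumably returns one
probability per class rather than a single positive-class probability.
A quick shape check, as a sketch under that assumption:

.. code:: python

    # Sketch, assuming predict_proba returns an (n_samples, n_classes)
    # array in the multiclass case; rows should then sum to ~1
    y_prob = clf.predict_proba(X_test)
    print(y_prob.shape)
    print(np.allclose(y_prob.sum(axis=1), 1.0))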