from cosmic_toolbox import arraytools as at, file_utils
import numpy as np
from scipy.special import expit
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from edelweiss.classifier import (
Classifier,
MultiClassifier,
MultiClassClassifier,
load_classifier,
load_multiclassifier,
)
np.random.seed(1996)
# Load the dataset and split
data = load_breast_cancer()
X = at.arr2rec(data.data, names=data.feature_names)
y = data.target
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, random_state=42
)
# select the scaler and classifier, whether to use cross-validation, and whether to calibrate the classifier
clf = Classifier(scaler="robust", clf="XGB", cv=3, cv_scoring="f1", calibrate=True)
clf.train(X_train, y_train)
clf.test(X_test, y_test)
24-07-25 14:34:16 classifier INF Training this model:
24-07-25 14:34:16 classifier INF CalibratedClassifierCV
24-07-25 14:34:16 classifier INF scaler:
24-07-25 14:34:16 classifier INF RobustScaler()
24-07-25 14:34:16 classifier INF clf:
24-07-25 14:34:16 classifier INF XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)
24-07-25 14:34:16 classifier INF number of samples: 398
24-07-25 14:34:16 classifier INF -------------------
24-07-25 14:34:16 classifier INF Start cross validation
24-07-25 14:34:16 classifier INF Running the Grid search on 1 jobs
24-07-25 14:34:31 classifier INF Best parameters found by grid search: {'clf__learning_rate': 0.5, 'clf__max_depth': 3, 'clf__n_estimators': 10}
24-07-25 14:34:31 classifier INF Training completed
24-07-25 14:34:31 clf_diagno INF Test scores:
24-07-25 14:34:31 clf_diagno INF ------------
24-07-25 14:34:31 clf_diagno INF Accuracy: 0.9649122807017544
24-07-25 14:34:31 clf_diagno INF Precision: 0.9636363636363636
24-07-25 14:34:31 clf_diagno INF Recall: 0.9814814814814815
24-07-25 14:34:31 clf_diagno INF F1 score: 0.9724770642201835
24-07-25 14:34:31 clf_diagno INF Number of positives: 110 / 108
24-07-25 14:34:31 clf_diagno INF ROC AUC score: 0.9944885361552028
24-07-25 14:34:31 clf_diagno INF Log loss score: 0.09747317087777803
24-07-25 14:34:31 clf_diagno INF Brier score: 0.028412256825347895
24-07-25 14:34:31 clf_diagno INF AUC-PR score: 0.9965684957065074
24-07-25 14:34:31 clf_diagno INF ------------
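The log above also reveals what Classifier builds under the hood: a scaler
and an XGBClassifier combined into a pipeline, tuned by a grid search
(cv=3, scored by f1) and wrapped in a CalibratedClassifierCV. For
orientation, here is a rough plain-sklearn sketch of that construction;
the parameter grid is an illustrative placeholder, not the edelweiss
default.
# Rough sklearn equivalent of what the logs suggest Classifier assembles
# internally (a sketch, not the actual edelweiss code).
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from xgboost import XGBClassifier

# plain sklearn needs a 2D float array instead of the structured array
X_dense = np.stack([X_train[n] for n in X_train.dtype.names], axis=1)
pipe = Pipeline([("scaler", RobustScaler()), ("clf", XGBClassifier())])
search = GridSearchCV(
    pipe,
    param_grid={  # placeholder grid, not the edelweiss default
        "clf__learning_rate": [0.1, 0.5],
        "clf__max_depth": [3, 6],
        "clf__n_estimators": [10, 100],
    },
    cv=3,
    scoring="f1",
)
search.fit(X_dense, y_train)
# calibrate=True corresponds to a calibration wrapper around the tuned model
model = CalibratedClassifierCV(search.best_estimator_).fit(X_dense, y_train)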
# there are 3 predict options
# predict (default):
# predicts 1/0 by sampling from the classifier's probability;
# a sample with prob=0.2 will be predicted as 1 in 20% of the cases
y_pred = clf.predict(X_test)  # equivalent to clf(X_test)
# non_proba:
# predicts 1/0 with a hard 0.5 threshold (1 if prob>0.5);
# a sample with prob=0.2 will never be predicted as 1
y_pred_non_proba = clf.predict_non_proba(X_test)
# proba:
# returns the probability itself;
# a sample with prob=0.2 will have output 0.2
y_prob = clf.predict_proba(X_test)
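# To make the difference concrete, a small numpy sketch of the three
# semantics (illustrative only, not the edelweiss implementation):
p = np.array([0.2, 0.5, 0.9])  # positive-class probabilities
sketch_proba = p  # proba: the probability itself
sketch_non_proba = p > 0.5  # non_proba: hard threshold at 0.5
sketch_predict = np.random.uniform(size=3) < p  # predict: Bernoulli draw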
# the classifier can easily be saved and loaded
clf.save("clf")
clf = load_classifier("clf")
file_utils.robust_remove("clf")
24-07-25 14:34:31 file_utils INF Created directory clf/clf
24-07-25 14:34:31 classifier INF Classifier saved to clf/clf
If you have label-encoded features (e.g. star/red galaxy/blue galaxy),
you can either use the standard classifier with them as input or split
your dataset and train a separate classifier for each of these
categories. For the latter, you can use the MultiClassifier class.
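Conceptually, the MultiClassifier trains one independent classifier per
category and routes each sample to the model matching its split label. A
minimal sketch of that idea (illustrative, not the actual implementation):
# Train one classifier per category of a label-encoded feature
# (a conceptual sketch of MultiClassifier, not edelweiss internals).
def train_per_label(X, y, split_label, labels):
    models = {}
    for lab in labels:
        mask = X[split_label] == lab
        model = Classifier()  # one independent classifier per category
        model.train(X[mask], y[mask])
        models[lab] = model
    return models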
# Generate such a dataset
n_samples = 10000
X = dict()
X["galaxy_type"] = np.random.randint(-1, 2, n_samples) # -1: star, 0: blue, 1: red
X["mag"] = np.random.uniform(20, 27, n_samples)
X["size"] = np.random.uniform(2, 5, n_samples)
X = at.dict2rec(X)
def detection_probability(mag, is_star):
# Sigmoid function centered at mag=24
base_prob = expit(-(mag - 24) * 2)
# Suppress probability for stars
return np.where(is_star, base_prob * 0.5, base_prob)
prob = detection_probability(X["mag"], X["galaxy_type"] == -1)
# Bernoulli draw: an object is detected if its detection probability exceeds a uniform random number
y = prob > np.random.uniform(0, 1, len(prob))
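# Quick sanity check of the simulation: by construction, stars should be
# detected about half as often as galaxies (illustrative check).
is_star = X["galaxy_type"] == -1
print(f"detected: stars {y[is_star].mean():.2f}, galaxies {y[~is_star].mean():.2f}")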
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, random_state=42
)
# the MultiClassifier can be used the same way as the normal classifier
clf = MultiClassifier(split_label="galaxy_type", labels=[-1, 0, 1])
clf.train(X_train, y_train)
clf.test(X_test, y_test)
24-07-25 14:34:31 classifier INF Training this model:
24-07-25 14:34:31 classifier INF CalibratedClassifierCV
24-07-25 14:34:31 classifier INF scaler:
24-07-25 14:34:31 classifier INF StandardScaler()
24-07-25 14:34:31 classifier INF clf:
24-07-25 14:34:31 classifier INF XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)
24-07-25 14:34:31 classifier INF number of samples: 2341
24-07-25 14:34:31 classifier INF -------------------
24-07-25 14:34:31 classifier INF Training completed
24-07-25 14:34:31 classifier INF Training this model:
24-07-25 14:34:31 classifier INF CalibratedClassifierCV
24-07-25 14:34:31 classifier INF scaler:
24-07-25 14:34:31 classifier INF StandardScaler()
24-07-25 14:34:31 classifier INF clf:
24-07-25 14:34:31 classifier INF XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)
24-07-25 14:34:31 classifier INF number of samples: 2288
24-07-25 14:34:31 classifier INF -------------------
24-07-25 14:34:32 classifier INF Training completed
24-07-25 14:34:32 classifier INF Training this model:
24-07-25 14:34:32 classifier INF CalibratedClassifierCV
24-07-25 14:34:32 classifier INF scaler:
24-07-25 14:34:32 classifier INF StandardScaler()
24-07-25 14:34:32 classifier INF clf:
24-07-25 14:34:32 classifier INF XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)
24-07-25 14:34:32 classifier INF number of samples: 2371
24-07-25 14:34:32 classifier INF -------------------
24-07-25 14:34:32 classifier INF Training completed
24-07-25 14:34:32 clf_diagno INF Test scores:
24-07-25 14:34:32 clf_diagno INF ------------
24-07-25 14:34:32 clf_diagno INF Accuracy: 0.7823333333333333
24-07-25 14:34:32 clf_diagno INF Precision: 0.7760381211708646
24-07-25 14:34:32 clf_diagno INF Recall: 0.7786885245901639
24-07-25 14:34:32 clf_diagno INF F1 score: 0.7773610637572451
24-07-25 14:34:32 clf_diagno INF Number of positives: 1469 / 1464
24-07-25 14:34:32 clf_diagno INF ROC AUC score: 0.9315192217383879
24-07-25 14:34:32 clf_diagno INF Log loss score: 0.3260016111110836
24-07-25 14:34:32 clf_diagno INF Brier score: 0.10553284672289355
24-07-25 14:34:32 clf_diagno INF AUC-PR score: 0.9335087911620549
24-07-25 14:34:32 clf_diagno INF ------------
# prediction, saving and loading work the same way as before
y_pred = clf.predict(X_test)
y_pred_non_proba = clf.predict_non_proba(X_test)
y_prob = clf.predict_proba(X_test)
clf.save("multi_clf")
clf = load_multiclassifier("multi_clf")
file_utils.robust_remove("multi_clf")
24-07-25 14:34:32 file_utils INF Created directory multi_clf/clf
24-07-25 14:34:32 file_utils INF Created directory multi_clf/clf_-1
24-07-25 14:34:32 classifier INF Classifier saved to multi_clf/clf_-1
24-07-25 14:34:32 file_utils INF Created directory multi_clf/clf_0
24-07-25 14:34:32 classifier INF Classifier saved to multi_clf/clf_0
24-07-25 14:34:32 file_utils INF Created directory multi_clf/clf_1
24-07-25 14:34:32 classifier INF Classifier saved to multi_clf/clf_1
24-07-25 14:34:32 classifier INF MultiClassifier saved to multi_clf/clf
So far the targets were binary; for problems with more than two classes,
you can use the MultiClassClassifier in the same way.
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=10000, n_informative=4, n_classes=3)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, random_state=42
)
clf = MultiClassClassifier()
clf.train(X_train, y_train)
clf.test(X_test, y_test)
24-09-19 17:05:01 classifier INF Training this model:
24-09-19 17:05:01 classifier INF CalibratedClassifierCV
24-09-19 17:05:01 classifier INF scaler:
24-09-19 17:05:01 classifier INF StandardScaler()
24-09-19 17:05:01 classifier INF clf:
24-09-19 17:05:01 classifier INF XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)
24-09-19 17:05:01 classifier INF number of samples: 7000
24-09-19 17:05:01 classifier INF -------------------
24-09-19 17:05:03 classifier INF Training completed
24-09-19 17:05:03 clf_diagno INF Test scores:
24-09-19 17:05:03 clf_diagno INF ------------
24-09-19 17:05:03 clf_diagno INF Accuracy: 0.8253333333333334
24-09-19 17:05:03 clf_diagno INF Precision: 0.8254668181132855
24-09-19 17:05:03 clf_diagno INF Recall: 0.8253333333333334
24-09-19 17:05:03 clf_diagno INF F1 score: 0.825280602771478
24-09-19 17:05:03 clf_diagno INF ------------
y_pred = clf.predict(X_test)
y_pred_non_proba = clf.predict_non_proba(X_test)
y_prob = clf.predict_proba(X_test)
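# Quick shape check: assuming one probability column per class, y_prob
# should have shape (n_samples, 3) here (an assumption, not verified).
print(y_prob.shape)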
clf.save("clf")
clf = load_classifier("clf")
file_utils.robust_remove("clf")
24-09-19 17:05:03 file_utils INF Created directory clf/clf
24-09-19 17:05:03 classifier INF Classifier saved to clf/clf