from cosmic_toolbox import arraytools as at, file_utils
import numpy as np
from scipy.special import expit
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from edelweiss.classifier import (
Classifier,
MultiClassifier,
MultiClassClassifier,
load_classifier,
load_multiclassifier,
)
np.random.seed(1996)
# Load the dataset and split
data = load_breast_cancer()
X = at.arr2rec(data.data, names=data.feature_names)
y = data.target
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, random_state=42
)
# select the scaler and classifier, whether to use cross-validation, and whether to calibrate the classifier
clf = Classifier(scaler="robust", clf="XGB", cv=3, cv_scoring="f1", calibrate=True)
clf.train(X_train, y_train)
clf.test(X_test, y_test)
24-07-25 14:34:16 classifier INF Training this model:
24-07-25 14:34:16 classifier INF CalibratedClassifierCV
24-07-25 14:34:16 classifier INF scaler:
24-07-25 14:34:16 classifier INF RobustScaler()
24-07-25 14:34:16 classifier INF clf:
24-07-25 14:34:16 classifier INF XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)
24-07-25 14:34:16 classifier INF number of samples: 398
24-07-25 14:34:16 classifier INF -------------------
24-07-25 14:34:16 classifier INF Start cross validation
24-07-25 14:34:16 classifier INF Running the Grid search on 1 jobs
24-07-25 14:34:31 classifier INF Best parameters found by grid search: {'clf__learning_rate': 0.5, 'clf__max_depth': 3, 'clf__n_estimators': 10}
24-07-25 14:34:31 classifier INF Training completed
24-07-25 14:34:31 clf_diagno INF Test scores:
24-07-25 14:34:31 clf_diagno INF ------------
24-07-25 14:34:31 clf_diagno INF Accuracy: 0.9649122807017544
24-07-25 14:34:31 clf_diagno INF Precision: 0.9636363636363636
24-07-25 14:34:31 clf_diagno INF Recall: 0.9814814814814815
24-07-25 14:34:31 clf_diagno INF F1 score: 0.9724770642201835
24-07-25 14:34:31 clf_diagno INF Number of positives: 110 / 108
24-07-25 14:34:31 clf_diagno INF ROC AUC score: 0.9944885361552028
24-07-25 14:34:31 clf_diagno INF Log loss score: 0.09747317087777803
24-07-25 14:34:31 clf_diagno INF Brier score: 0.028412256825347895
24-07-25 14:34:31 clf_diagno INF AUC-PR score: 0.9965684957065074
24-07-25 14:34:31 clf_diagno INF ------------
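The log above also reveals what Classifier builds under the hood: a scaler
and an XGBClassifier combined into a pipeline, tuned by a grid search
(cv=3, scored by f1) and wrapped in a CalibratedClassifierCV. For
orientation, here is a rough plain-sklearn sketch of that construction;
the parameter grid is an illustrative placeholder, not the edelweiss
default.
# Rough sklearn equivalent of what the logs suggest Classifier assembles
# internally (a sketch, not the actual edelweiss code).
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from xgboost import XGBClassifier

# plain sklearn needs a 2D float array instead of the structured array
X_dense = np.stack([X_train[n] for n in X_train.dtype.names], axis=1)
pipe = Pipeline([("scaler", RobustScaler()), ("clf", XGBClassifier())])
search = GridSearchCV(
    pipe,
    param_grid={  # placeholder grid, not the edelweiss default
        "clf__learning_rate": [0.1, 0.5],
        "clf__max_depth": [3, 6],
        "clf__n_estimators": [10, 100],
    },
    cv=3,
    scoring="f1",
)
search.fit(X_dense, y_train)
# calibrate=True corresponds to a calibration wrapper around the tuned model
model = CalibratedClassifierCV(search.best_estimator_).fit(X_dense, y_train)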
# there are 3 predict options
# predict (default):
# predicts 1/0 by sampling from the classifier's probability;
# a sample with prob=0.2 will be predicted as 1 in 20% of the cases
y_pred = clf.predict(X_test)  # equivalent to clf(X_test)
# non_proba:
# predicts 1/0 with a hard 0.5 threshold (1 if prob>0.5);
# a sample with prob=0.2 will never be predicted as 1
y_pred_non_proba = clf.predict_non_proba(X_test)
# proba:
# returns the probability itself;
# a sample with prob=0.2 will have output 0.2
y_prob = clf.predict_proba(X_test)
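# To make the difference concrete, a small numpy sketch of the three
# semantics (illustrative only, not the edelweiss implementation):
p = np.array([0.2, 0.5, 0.9])  # positive-class probabilities
sketch_proba = p  # proba: the probability itself
sketch_non_proba = p > 0.5  # non_proba: hard threshold at 0.5
sketch_predict = np.random.uniform(size=3) < p  # predict: Bernoulli draw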
# the classifier can easily be saved and loaded
clf.save("clf")
clf = load_classifier("clf")
file_utils.robust_remove("clf")
24-07-25 14:34:31 file_utils INF Created directory clf/clf
24-07-25 14:34:31 classifier INF Classifier saved to clf/clf
If you have label-encoded features (e.g. star/red galaxy/blue galaxy),
you can either use the standard classifier with them as input or split
your dataset and train a separate classifier for each of these
categories. For the latter, you can use the MultiClassifier class.
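Conceptually, the MultiClassifier trains one independent classifier per
category and routes each sample to the model matching its split label. A
minimal sketch of that idea (illustrative, not the actual implementation):
# Train one classifier per category of a label-encoded feature
# (a conceptual sketch of MultiClassifier, not edelweiss internals).
def train_per_label(X, y, split_label, labels):
    models = {}
    for lab in labels:
        mask = X[split_label] == lab
        model = Classifier()  # one independent classifier per category
        model.train(X[mask], y[mask])
        models[lab] = model
    return models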
# Generate such a dataset
n_samples = 10000
X = dict()
X["galaxy_type"] = np.random.randint(-1, 2, n_samples) # -1: star, 0: blue, 1: red
X["mag"] = np.random.uniform(20, 27, n_samples)
X["size"] = np.random.uniform(2, 5, n_samples)
X = at.dict2rec(X)
def detection_probability(mag, is_star):
# Sigmoid function centered at mag=24
base_prob = expit(-(mag - 24) * 2)
# Suppress probability for stars
return np.where(is_star, base_prob * 0.5, base_prob)
prob = detection_probability(X["mag"], X["galaxy_type"] == -1)
# Bernoulli draw: an object is detected if its detection probability exceeds a uniform random number
y = prob > np.random.uniform(0, 1, len(prob))
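# Quick sanity check of the simulation: by construction, stars should be
# detected about half as often as galaxies (illustrative check).
is_star = X["galaxy_type"] == -1
print(f"detected: stars {y[is_star].mean():.2f}, galaxies {y[~is_star].mean():.2f}")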
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, random_state=42
)
# the MultiClassifier can be used the same way as the normal classifier
clf = MultiClassifier(split_label="galaxy_type", labels=[-1, 0, 1])
clf.train(X_train, y_train)
clf.test(X_test, y_test)
24-07-25 14:34:31 classifier INF Training this model:
24-07-25 14:34:31 classifier INF CalibratedClassifierCV
24-07-25 14:34:31 classifier INF scaler:
24-07-25 14:34:31 classifier INF StandardScaler()
24-07-25 14:34:31 classifier INF clf:
24-07-25 14:34:31 classifier INF XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)
24-07-25 14:34:31 classifier INF number of samples: 2341
24-07-25 14:34:31 classifier INF -------------------
24-07-25 14:34:31 classifier INF Training completed
24-07-25 14:34:31 classifier INF Training this model:
24-07-25 14:34:31 classifier INF CalibratedClassifierCV
24-07-25 14:34:31 classifier INF scaler:
24-07-25 14:34:31 classifier INF StandardScaler()
24-07-25 14:34:31 classifier INF clf:
24-07-25 14:34:31 classifier INF XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)
24-07-25 14:34:31 classifier INF number of samples: 2288
24-07-25 14:34:31 classifier INF -------------------
24-07-25 14:34:32 classifier INF Training completed
24-07-25 14:34:32 classifier INF Training this model:
24-07-25 14:34:32 classifier INF CalibratedClassifierCV
24-07-25 14:34:32 classifier INF scaler:
24-07-25 14:34:32 classifier INF StandardScaler()
24-07-25 14:34:32 classifier INF clf:
24-07-25 14:34:32 classifier INF XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)
24-07-25 14:34:32 classifier INF number of samples: 2371
24-07-25 14:34:32 classifier INF -------------------
24-07-25 14:34:32 classifier INF Training completed
24-07-25 14:34:32 clf_diagno INF Test scores:
24-07-25 14:34:32 clf_diagno INF ------------
24-07-25 14:34:32 clf_diagno INF Accuracy: 0.7823333333333333
24-07-25 14:34:32 clf_diagno INF Precision: 0.7760381211708646
24-07-25 14:34:32 clf_diagno INF Recall: 0.7786885245901639
24-07-25 14:34:32 clf_diagno INF F1 score: 0.7773610637572451
24-07-25 14:34:32 clf_diagno INF Number of positives: 1469 / 1464
24-07-25 14:34:32 clf_diagno INF ROC AUC score: 0.9315192217383879
24-07-25 14:34:32 clf_diagno INF Log loss score: 0.3260016111110836
24-07-25 14:34:32 clf_diagno INF Brier score: 0.10553284672289355
24-07-25 14:34:32 clf_diagno INF AUC-PR score: 0.9335087911620549
24-07-25 14:34:32 clf_diagno INF ------------
# prediction, saving and loading work the same way as before
y_pred = clf.predict(X_test)
y_pred_non_proba = clf.predict_non_proba(X_test)
y_prob = clf.predict_proba(X_test)
clf.save("multi_clf")
clf = load_multiclassifier("multi_clf")
file_utils.robust_remove("multi_clf")
24-07-25 14:34:32 file_utils INF Created directory multi_clf/clf
24-07-25 14:34:32 file_utils INF Created directory multi_clf/clf_-1
24-07-25 14:34:32 classifier INF Classifier saved to multi_clf/clf_-1
24-07-25 14:34:32 file_utils INF Created directory multi_clf/clf_0
24-07-25 14:34:32 classifier INF Classifier saved to multi_clf/clf_0
24-07-25 14:34:32 file_utils INF Created directory multi_clf/clf_1
24-07-25 14:34:32 classifier INF Classifier saved to multi_clf/clf_1
24-07-25 14:34:32 classifier INF MultiClassifier saved to multi_clf/clf
So far the targets were binary; for problems with more than two classes,
you can use the MultiClassClassifier in the same way.
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=10000, n_informative=4, n_classes=3)
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.3, random_state=42
)
clf = MultiClassClassifier()
clf.train(X_train, y_train)
clf.test(X_test, y_test)
24-09-19 17:05:01 classifier INF Training this model:
24-09-19 17:05:01 classifier INF CalibratedClassifierCV
24-09-19 17:05:01 classifier INF scaler:
24-09-19 17:05:01 classifier INF StandardScaler()
24-09-19 17:05:01 classifier INF clf:
24-09-19 17:05:01 classifier INF XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=None, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None, n_jobs=None,
num_parallel_tree=None, random_state=None, ...)
24-09-19 17:05:01 classifier INF number of samples: 7000
24-09-19 17:05:01 classifier INF -------------------
24-09-19 17:05:03 classifier INF Training completed
24-09-19 17:05:03 clf_diagno INF Test scores:
24-09-19 17:05:03 clf_diagno INF ------------
24-09-19 17:05:03 clf_diagno INF Accuracy: 0.8253333333333334
24-09-19 17:05:03 clf_diagno INF Precision: 0.8254668181132855
24-09-19 17:05:03 clf_diagno INF Recall: 0.8253333333333334
24-09-19 17:05:03 clf_diagno INF F1 score: 0.825280602771478
24-09-19 17:05:03 clf_diagno INF ------------
y_pred = clf.predict(X_test)
y_pred_non_proba = clf.predict_non_proba(X_test)
y_prob = clf.predict_proba(X_test)
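# Quick shape check: assuming one probability column per class, y_prob
# should have shape (n_samples, 3) here (an assumption, not verified).
print(y_prob.shape)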
clf.save("clf")
clf = load_classifier("clf")
file_utils.robust_remove("clf")
24-09-19 17:05:03 file_utils INF Created directory clf/clf
24-09-19 17:05:03 classifier INF Classifier saved to clf/clf