=================
Usage: Classifier
=================

.. code:: python

    from cosmic_toolbox import arraytools as at, file_utils
    import numpy as np
    from scipy.special import expit
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split

    from edelweiss.classifier import (
        Classifier,
        MultiClassifier,
        MultiClassClassifier,
        load_classifier,
        load_multiclassifier,
    )

    np.random.seed(1996)

Classifier
==========

.. code:: python

    # Load the dataset and split it into training and test sets
    data = load_breast_cancer()
    X = at.arr2rec(data.data, names=data["feature_names"])
    y = data.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

.. code:: python

    # Select the scaling method and the classifier, whether to use
    # cross-validation, and whether the classifier should be calibrated
    clf = Classifier(scaler="robust", clf="XGB", cv=3, cv_scoring="f1", calibrate=True)
    clf.train(X_train, y_train)
    clf.test(X_test, y_test)

.. parsed-literal::

    24-07-25 14:34:16 classifier INF Training this model:
    24-07-25 14:34:16 classifier INF CalibratedClassifierCV
    24-07-25 14:34:16 classifier INF scaler:
    24-07-25 14:34:16 classifier INF RobustScaler()
    24-07-25 14:34:16 classifier INF clf:
    24-07-25 14:34:16 classifier INF XGBClassifier(base_score=None, booster=None, callbacks=None,
                  colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None,
                  device=None, early_stopping_rounds=None, enable_categorical=False,
                  eval_metric=None, feature_types=None, gamma=None, grow_policy=None,
                  importance_type=None, interaction_constraints=None, learning_rate=None,
                  max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
                  max_delta_step=None, max_depth=None, max_leaves=None,
                  min_child_weight=None, missing=nan, monotone_constraints=None,
                  multi_strategy=None, n_estimators=None, n_jobs=None,
                  num_parallel_tree=None, random_state=None, ...)
    24-07-25 14:34:16 classifier INF number of samples: 398
    24-07-25 14:34:16 classifier INF -------------------
    24-07-25 14:34:16 classifier INF Start cross validation
    24-07-25 14:34:16 classifier INF Running the Grid search on 1 jobs
    24-07-25 14:34:31 classifier INF Best parameters found by grid search: {'clf__learning_rate': 0.5, 'clf__max_depth': 3, 'clf__n_estimators': 10}
    24-07-25 14:34:31 classifier INF Training completed
    24-07-25 14:34:31 clf_diagno INF Test scores:
    24-07-25 14:34:31 clf_diagno INF ------------
    24-07-25 14:34:31 clf_diagno INF Accuracy: 0.9649122807017544
    24-07-25 14:34:31 clf_diagno INF Precision: 0.9636363636363636
    24-07-25 14:34:31 clf_diagno INF Recall: 0.9814814814814815
    24-07-25 14:34:31 clf_diagno INF F1 score: 0.9724770642201835
    24-07-25 14:34:31 clf_diagno INF Number of positives: 110 / 108
    24-07-25 14:34:31 clf_diagno INF ROC AUC score: 0.9944885361552028
    24-07-25 14:34:31 clf_diagno INF Log loss score: 0.09747317087777803
    24-07-25 14:34:31 clf_diagno INF Brier score: 0.028412256825347895
    24-07-25 14:34:31 clf_diagno INF AUC-PR score: 0.9965684957065074
    24-07-25 14:34:31 clf_diagno INF ------------

.. code:: python

    # There are three predict options.

    # predict (default):
    # predicts 1/0 by sampling against the classifier's probability;
    # a sample with prob=0.2 will be predicted as 1 in 20% of the cases
    y_pred = clf.predict(X_test)  # equivalent to clf(X_test)

    # non_proba:
    # predicts 1/0 deterministically if prob > 0.5;
    # a sample with prob=0.2 will never be predicted as 1
    y_pred_non_proba = clf.predict_non_proba(X_test)

    # proba:
    # predicts the probability itself;
    # a sample with prob=0.2 will have output 0.2
    y_prob = clf.predict_proba(X_test)
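Conceptually, the default ``predict`` behaves like sampling against the
returned probabilities. The following is a minimal sketch of that idea
only (not the edelweiss internals), assuming ``predict_proba`` returns a
1D array of positive-class probabilities:

.. code:: python

    # Sketch: compare each probability to a uniform draw, so a sample
    # with prob=0.2 comes out positive in roughly 20% of draws
    probs = clf.predict_proba(X_test)
    y_sampled = probs > np.random.uniform(0, 1, size=len(probs))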
.. code:: python

    # The classifier can easily be saved and loaded
    clf.save("clf")
    clf = load_classifier("clf")
    file_utils.robust_remove("clf")

.. parsed-literal::

    24-07-25 14:34:31 file_utils INF Created directory clf/clf
    24-07-25 14:34:31 classifier INF Classifier saved to clf/clf

MultiClassifier
===============

If you have label-encoded features (e.g. star/red galaxy/blue galaxy),
you can either use the standard classifier with them as input, or you
can split your dataset and train a separate classifier for each of
these categories. For the latter, you can use the ``MultiClassifier``
class; a conceptual sketch of the split follows after the training
output below.

.. code:: python

    # Generate such a dataset
    n_samples = 10000
    X = dict()
    X["galaxy_type"] = np.random.randint(-1, 2, n_samples)  # -1: star, 0: blue, 1: red
    X["mag"] = np.random.uniform(20, 27, n_samples)
    X["size"] = np.random.uniform(2, 5, n_samples)
    X = at.dict2rec(X)


    def detection_probability(mag, is_star):
        # Sigmoid function centered at mag=24
        base_prob = expit(-(mag - 24) * 2)
        # Suppress the detection probability for stars
        return np.where(is_star, base_prob * 0.5, base_prob)


    prob = detection_probability(X["mag"], X["galaxy_type"] == -1)
    y = prob > np.random.uniform(0, 1, len(prob))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

.. code:: python

    # The MultiClassifier can be used the same way as the normal classifier
    clf = MultiClassifier(split_label="galaxy_type", labels=[-1, 0, 1])
    clf.train(X_train, y_train)
    clf.test(X_test, y_test)

.. parsed-literal::

    24-07-25 14:34:31 classifier INF Training this model:
    24-07-25 14:34:31 classifier INF CalibratedClassifierCV
    24-07-25 14:34:31 classifier INF scaler:
    24-07-25 14:34:31 classifier INF StandardScaler()
    24-07-25 14:34:31 classifier INF clf:
    24-07-25 14:34:31 classifier INF XGBClassifier(base_score=None, booster=None, callbacks=None,
                  colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None,
                  device=None, early_stopping_rounds=None, enable_categorical=False,
                  eval_metric=None, feature_types=None, gamma=None, grow_policy=None,
                  importance_type=None, interaction_constraints=None, learning_rate=None,
                  max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
                  max_delta_step=None, max_depth=None, max_leaves=None,
                  min_child_weight=None, missing=nan, monotone_constraints=None,
                  multi_strategy=None, n_estimators=None, n_jobs=None,
                  num_parallel_tree=None, random_state=None, ...)
    24-07-25 14:34:31 classifier INF number of samples: 2341
    24-07-25 14:34:31 classifier INF -------------------
    24-07-25 14:34:31 classifier INF Training completed
    24-07-25 14:34:31 classifier INF Training this model:
    24-07-25 14:34:31 classifier INF CalibratedClassifierCV
    24-07-25 14:34:31 classifier INF scaler:
    24-07-25 14:34:31 classifier INF StandardScaler()
    24-07-25 14:34:31 classifier INF clf:
    24-07-25 14:34:31 classifier INF XGBClassifier(base_score=None, booster=None, callbacks=None,
                  colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None,
                  device=None, early_stopping_rounds=None, enable_categorical=False,
                  eval_metric=None, feature_types=None, gamma=None, grow_policy=None,
                  importance_type=None, interaction_constraints=None, learning_rate=None,
                  max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
                  max_delta_step=None, max_depth=None, max_leaves=None,
                  min_child_weight=None, missing=nan, monotone_constraints=None,
                  multi_strategy=None, n_estimators=None, n_jobs=None,
                  num_parallel_tree=None, random_state=None, ...)
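The training output above shows one fit per category, which matches the
idea of the split: one subset of the data, and hence one classifier, per
value of ``split_label``. The following is a conceptual sketch of that
split (illustration only, not the edelweiss internals):

.. code:: python

    # Sketch: count the training samples that each per-label
    # classifier would see
    for label in (-1, 0, 1):
        mask = X_train["galaxy_type"] == label
        print(f"galaxy_type={label}: {mask.sum()} training samples")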
    24-07-25 14:34:31 classifier INF number of samples: 2288
    24-07-25 14:34:31 classifier INF -------------------
    24-07-25 14:34:32 classifier INF Training completed
    24-07-25 14:34:32 classifier INF Training this model:
    24-07-25 14:34:32 classifier INF CalibratedClassifierCV
    24-07-25 14:34:32 classifier INF scaler:
    24-07-25 14:34:32 classifier INF StandardScaler()
    24-07-25 14:34:32 classifier INF clf:
    24-07-25 14:34:32 classifier INF XGBClassifier(base_score=None, booster=None, callbacks=None,
                  colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None,
                  device=None, early_stopping_rounds=None, enable_categorical=False,
                  eval_metric=None, feature_types=None, gamma=None, grow_policy=None,
                  importance_type=None, interaction_constraints=None, learning_rate=None,
                  max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
                  max_delta_step=None, max_depth=None, max_leaves=None,
                  min_child_weight=None, missing=nan, monotone_constraints=None,
                  multi_strategy=None, n_estimators=None, n_jobs=None,
                  num_parallel_tree=None, random_state=None, ...)
    24-07-25 14:34:32 classifier INF number of samples: 2371
    24-07-25 14:34:32 classifier INF -------------------
    24-07-25 14:34:32 classifier INF Training completed
    24-07-25 14:34:32 clf_diagno INF Test scores:
    24-07-25 14:34:32 clf_diagno INF ------------
    24-07-25 14:34:32 clf_diagno INF Accuracy: 0.7823333333333333
    24-07-25 14:34:32 clf_diagno INF Precision: 0.7760381211708646
    24-07-25 14:34:32 clf_diagno INF Recall: 0.7786885245901639
    24-07-25 14:34:32 clf_diagno INF F1 score: 0.7773610637572451
    24-07-25 14:34:32 clf_diagno INF Number of positives: 1469 / 1464
    24-07-25 14:34:32 clf_diagno INF ROC AUC score: 0.9315192217383879
    24-07-25 14:34:32 clf_diagno INF Log loss score: 0.3260016111110836
    24-07-25 14:34:32 clf_diagno INF Brier score: 0.10553284672289355
    24-07-25 14:34:32 clf_diagno INF AUC-PR score: 0.9335087911620549
    24-07-25 14:34:32 clf_diagno INF ------------

.. code:: python

    # Prediction, saving, and loading work the same way as before
    y_pred = clf.predict(X_test)
    y_pred_non_proba = clf.predict_non_proba(X_test)
    y_prob = clf.predict_proba(X_test)

    clf.save("multi_clf")
    clf = load_multiclassifier("multi_clf")
    file_utils.robust_remove("multi_clf")

.. parsed-literal::

    24-07-25 14:34:32 file_utils INF Created directory multi_clf/clf
    24-07-25 14:34:32 file_utils INF Created directory multi_clf/clf_-1
    24-07-25 14:34:32 classifier INF Classifier saved to multi_clf/clf_-1
    24-07-25 14:34:32 file_utils INF Created directory multi_clf/clf_0
    24-07-25 14:34:32 classifier INF Classifier saved to multi_clf/clf_0
    24-07-25 14:34:32 file_utils INF Created directory multi_clf/clf_1
    24-07-25 14:34:32 classifier INF Classifier saved to multi_clf/clf_1
    24-07-25 14:34:32 classifier INF MultiClassifier saved to multi_clf/clf
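A quick consistency check after a save/load round trip can be
reassuring. The sketch below uses only the calls shown above; the
directory name ``multi_clf_check`` is hypothetical, and
``predict_non_proba`` is used because the default ``predict`` samples
from probabilities and is therefore not deterministic:

.. code:: python

    # Hypothetical round-trip check: the deterministic predictor should
    # give identical results before and after saving/loading
    clf.save("multi_clf_check")
    clf_loaded = load_multiclassifier("multi_clf_check")
    assert np.array_equal(
        clf.predict_non_proba(X_test), clf_loaded.predict_non_proba(X_test)
    )
    file_utils.robust_remove("multi_clf_check")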
Multiclass Classifier
=====================

.. code:: python

    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=10000, n_informative=4, n_classes=3)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    clf = MultiClassClassifier()
    clf.train(X_train, y_train)
    clf.test(X_test, y_test)

.. parsed-literal::

    24-09-19 17:05:01 classifier INF Training this model:
    24-09-19 17:05:01 classifier INF CalibratedClassifierCV
    24-09-19 17:05:01 classifier INF scaler:
    24-09-19 17:05:01 classifier INF StandardScaler()
    24-09-19 17:05:01 classifier INF clf:
    24-09-19 17:05:01 classifier INF XGBClassifier(base_score=None, booster=None, callbacks=None,
                  colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None,
                  device=None, early_stopping_rounds=None, enable_categorical=False,
                  eval_metric=None, feature_types=None, gamma=None, grow_policy=None,
                  importance_type=None, interaction_constraints=None, learning_rate=None,
                  max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
                  max_delta_step=None, max_depth=None, max_leaves=None,
                  min_child_weight=None, missing=nan, monotone_constraints=None,
                  multi_strategy=None, n_estimators=None, n_jobs=None,
                  num_parallel_tree=None, random_state=None, ...)
    24-09-19 17:05:01 classifier INF number of samples: 7000
    24-09-19 17:05:01 classifier INF -------------------
    24-09-19 17:05:03 classifier INF Training completed
    24-09-19 17:05:03 clf_diagno INF Test scores:
    24-09-19 17:05:03 clf_diagno INF ------------
    24-09-19 17:05:03 clf_diagno INF Accuracy: 0.8253333333333334
    24-09-19 17:05:03 clf_diagno INF Precision: 0.8254668181132855
    24-09-19 17:05:03 clf_diagno INF Recall: 0.8253333333333334
    24-09-19 17:05:03 clf_diagno INF F1 score: 0.825280602771478
    24-09-19 17:05:03 clf_diagno INF ------------

.. code:: python

    y_pred = clf.predict(X_test)
    y_pred_non_proba = clf.predict_non_proba(X_test)
    y_prob = clf.predict_proba(X_test)

    clf.save("clf")
    clf = load_classifier("clf")
    file_utils.robust_remove("clf")

.. parsed-literal::

    24-09-19 17:05:03 file_utils INF Created directory clf/clf
    24-09-19 17:05:03 classifier INF Classifier saved to clf/clf
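For the multiclass case, ``predict_proba`` presumably returns one
probability per class rather than a single positive-class probability.
A quick shape check, as a sketch under that assumption:

.. code:: python

    # Sketch, assuming predict_proba returns an (n_samples, n_classes)
    # array in the multiclass case; rows should then sum to ~1
    y_prob = clf.predict_proba(X_test)
    print(y_prob.shape)
    print(np.allclose(y_prob.sum(axis=1), 1.0))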