Training the classifier

Let's use the logistic regression classifier, which has already served us well in Chapter 9, Classification II – Sentiment Analysis:

# Import from the public package path; the private ``sklearn.linear_model.logistic``
# module is not available in newer scikit-learn releases.
from sklearn.linear_model import LogisticRegression


def create_model():
    """Return a new, unfitted logistic regression classifier with default settings."""
    return LogisticRegression()

One aspect can be surprising when first switching from binary to multiclass classification: the interpretation of accuracy rates. In binary classification problems, we have learned that an accuracy of 50% is the worst case, as it could have been achieved by random guessing. In multiclass settings, 50% can already be very good. With our six genres, for instance, random guessing would achieve an accuracy of only 16.7% (assuming equal class sizes).

The full training procedure now looks as follows:

from collections import defaultdict

import numpy as np
from sklearn.metrics import (
    auc,
    confusion_matrix,
    precision_recall_curve,
    roc_curve,
)
from sklearn.model_selection import ShuffleSplit


def train_model(clf_factory, X, Y):
    """Fit and evaluate a classifier on a shuffled 70/30 train/test split.

    For each split, a fresh classifier is created, fitted on the training
    part and scored on both parts. A confusion matrix is computed on the test
    part, and one-vs-rest precision/recall and ROC curves are computed for
    every label found in ``Y``.

    A tab-separated summary line is printed: mean and standard deviation of
    the test scores, then mean and standard deviation of the per-label
    precision/recall AUC values.

    Args:
        clf_factory: Zero-argument callable returning a new, unfitted
            classifier that provides ``fit``, ``score``, ``predict``,
            ``predict_proba`` and ``classes_``.
        X: Samples; must support indexing with an integer index array.
        Y: Labels aligned with ``X``; must support the same indexing.

    Returns:
        Tuple ``(mean_train_error, mean_test_error, confusion_matrices)``
        where each error is ``1 - clf.score(...)`` averaged over the splits
        and ``confusion_matrices`` is an array with one matrix per split.
    """
    labels = np.unique(Y)

    cv = ShuffleSplit(n_splits=1, test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []

    scores = []

    # The per-label curve data and the fitted classifiers below are collected
    # for every split but are not used in this function's summary or return
    # value.
    pr_scores = defaultdict(list)
    precisions = defaultdict(list)
    recalls = defaultdict(list)
    thresholds = defaultdict(list)

    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)

    clfs = []  # used to later get the median

    cms = []

    # Splitters from sklearn.model_selection are not iterable themselves;
    # the index pairs come from split().
    for train, test in cv.split(X):
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_factory()
        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)  # will be explained soon
        cms.append(cm)

        # The probabilities do not depend on the label, so compute them once
        # per split instead of once per label.
        proba = clf.predict_proba(X_test)
        # predict_proba columns are ordered like clf.classes_; look the
        # column up instead of assuming label values equal column positions.
        column_of = {cls: idx for idx, cls in enumerate(clf.classes_)}

        for label in labels:
            if label not in column_of:
                # Label never seen during training: the classifier has no
                # probability column for it, so no curve can be computed.
                continue

            # One-vs-rest ground truth: 1 where the sample has this label.
            y_label_test = np.asarray(y_test == label, dtype=int)
            proba_label = proba[:, column_of[label]]

            precision, recall, pr_thresholds = precision_recall_curve(
                y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)
            thresholds[label].append(pr_thresholds)

            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    # Flatten explicitly: wrapping a dict view in np.asarray yields a 0-d
    # object array on Python 3, on which mean/std cannot be computed.
    all_pr_scores = np.asarray(
        [score for per_label in pr_scores.values() for score in per_label])
    summary = (np.mean(scores), np.std(scores),
               np.mean(all_pr_scores), np.std(all_pr_scores))
    print("%.3f\t%.3f\t%.3f\t%.3f\t" % summary)

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)

The whole training invocation is as follows:

# NOTE(review): read_fft and GENRES are defined outside this excerpt;
# presumably X holds the per-sample features and Y the genre labels — confirm.
X, Y = read_fft(GENRES)
# train_model returns the mean training error, the mean test error, and an
# array of the confusion matrices (one per split).
train_avg, test_avg, cms = train_model(create_model, X, Y)

Table of Contents for Training the classifier

Create new playlist

Sign In

Sign Up

Table of Contents for
Training the classifier