Training the classifier

Let's use the logistic regression classifier, which has already served us well in Chapter 9, Classification II – Sentiment Analysis:

# LogisticRegression is imported from the public package path; the private
# ``sklearn.linear_model.logistic`` module is not available in current
# scikit-learn releases.
from sklearn.linear_model import LogisticRegression


def create_model():
    """Return a fresh, unfitted logistic regression classifier.

    A new instance is built on every call, so the function can be handed
    around as a classifier factory and invoked once per training run.
    """
    return LogisticRegression()

One aspect may be surprising when first switching from binary to multiclass classification: how to judge accuracy rates. In binary classification problems, we have learned that an accuracy of 50% is the worst case, as it could have been achieved by random guessing. In multiclass settings, 50% can already be very good. With our six genres, for instance, random guessing would result in an accuracy of only 16.7% (assuming equal class sizes).

The full training procedure now looks as follows:

from collections import defaultdict

import numpy as np
from sklearn.metrics import (
    auc,
    confusion_matrix,
    precision_recall_curve,
    roc_curve,
)
from sklearn.model_selection import ShuffleSplit


def train_model(clf_factory, X, Y):
    """Train and evaluate a classifier on one shuffled train/test split.

    For the split, a new classifier is created and fitted, its train and
    test accuracy are recorded, a confusion matrix is computed on the test
    part, and one-vs-rest precision/recall and ROC curves are computed for
    every label found in ``Y``.  A one-line summary (mean and standard
    deviation of the test accuracy, then mean and standard deviation of
    the precision/recall AUC over all labels) is printed.

    Args:
        clf_factory: Zero-argument callable returning an unfitted
            classifier that provides ``fit``, ``score``, ``predict`` and
            ``predict_proba``.
        X: Feature array; must support indexing with an integer index
            array (e.g. a NumPy array).
        Y: Label array aligned with ``X``, indexable the same way.

    Returns:
        A tuple ``(mean_train_error, mean_test_error, cms)`` where the
        errors are ``1 - accuracy`` averaged over the splits and ``cms``
        is an array holding one confusion matrix per split.
    """
    labels = np.unique(Y)

    cv = ShuffleSplit(n_splits=1, test_size=0.3, random_state=0)

    train_errors = []
    test_errors = []

    scores = []
    pr_scores = defaultdict(list)
    precisions = defaultdict(list)
    recalls = defaultdict(list)
    thresholds = defaultdict(list)

    roc_scores = defaultdict(list)
    tprs = defaultdict(list)
    fprs = defaultdict(list)

    clfs = []  # used to later get the median

    cms = []

    # A model_selection splitter is not iterable by itself; split() yields
    # the (train, test) index arrays.
    for train, test in cv.split(X):
        X_train, y_train = X[train], Y[train]
        X_test, y_test = X[test], Y[test]

        clf = clf_factory()
        clf.fit(X_train, y_train)
        clfs.append(clf)

        train_score = clf.score(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        scores.append(test_score)

        train_errors.append(1 - train_score)
        test_errors.append(1 - test_score)

        y_pred = clf.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)  # will be explained soon
        cms.append(cm)

        # The class probabilities do not depend on the label being
        # evaluated, so they are computed once per split.
        proba = clf.predict_proba(X_test)

        for label in labels:
            # One-vs-rest ground truth: 1 where the sample has this label.
            y_label_test = np.asarray(y_test == label, dtype=int)
            # NOTE(review): using the label as a column index assumes the
            # labels are the integers 0..n_classes-1 in the classifier's
            # column order -- confirm against how Y is encoded.
            proba_label = proba[:, label]

            precision, recall, pr_thresholds = precision_recall_curve(
                y_label_test, proba_label)
            pr_scores[label].append(auc(recall, precision))
            precisions[label].append(precision)
            recalls[label].append(recall)
            thresholds[label].append(pr_thresholds)

            fpr, tpr, roc_thresholds = roc_curve(y_label_test, proba_label)
            roc_scores[label].append(auc(fpr, tpr))
            tprs[label].append(tpr)
            fprs[label].append(fpr)

    # dict.values() is a view on Python 3; materialize it so NumPy builds a
    # numeric array instead of a 0-d object array.
    all_pr_scores = np.asarray(list(pr_scores.values())).flatten()
    summary = (np.mean(scores), np.std(scores),
               np.mean(all_pr_scores), np.std(all_pr_scores))
    print("%.3f %.3f %.3f %.3f " % summary)

    return np.mean(train_errors), np.mean(test_errors), np.asarray(cms)

The whole training invocation is as follows:

# read_fft and GENRES are defined outside this excerpt; presumably this loads
# the FFT-based features and genre labels for every genre -- confirm there.
X, Y = read_fft(GENRES)
# train_model returns the mean training error, the mean test error and the
# confusion matrices collected during evaluation.
train_avg, test_avg, cms = train_model(create_model, X, Y)
..................Content has been hidden....................

You can't read the whole page of this ebook; please click here to log in and view the full page.
Reset