AdaBoost简介

AdaBoost是Boosting家族中最著名的算法，它将多个弱学习器（也称为基学习器）组合成一个强学习器。AdaBoost算法有多种推导方式，其中最简单的一种基于加性模型，即弱学习器的线性组合：$H(x)=\sum_{t=1}^{T}\alpha_t h_t(x)$，其中 $h_t$ 是第 $t$ 个弱学习器，$\alpha_t$ 是它的权重。

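为了最小化指数损失函数 $\ell_{\exp}(H)=\mathbb{E}\left[e^{-yH(x)}\right]$（其中标签 $y\in\{-1,+1\}$，$H(x)$ 为各弱学习器的加权和），对第 $t$ 个弱学习器 $h_t$ 的权重 $\alpha_t$ 求导并令导数为零，我们有 $\alpha_t=\frac{1}{2}\ln\frac{1-\epsilon_t}{\epsilon_t}$，其中 $\epsilon_t$ 是 $h_t$ 在当前样本权重分布下的加权错误率。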

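以下是AdaBoost的算法流程：

1. 初始化样本权重 $w_i^{(1)}=\frac{1}{n}$，$i=1,\dots,n$。
2. 对 $t=1,\dots,T$：用带权重 $w^{(t)}$ 的训练集训练弱学习器 $h_t$；计算加权错误率 $\epsilon_t=\sum_i w_i^{(t)}\,\mathbb{1}[h_t(x_i)\neq y_i]$；若 $\epsilon_t\ge 0.5$ 则丢弃该学习器；否则令 $\alpha_t=\frac{1}{2}\ln\frac{1-\epsilon_t}{\epsilon_t}$，并更新 $w_i^{(t+1)}=\frac{w_i^{(t)}\exp(-\alpha_t y_i h_t(x_i))}{Z_t}$，其中 $Z_t$ 为归一化因子。
3. 输出强学习器 $H(x)=\mathrm{sign}\left(\sum_{t}\alpha_t h_t(x)\right)$。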

代码示例

特征提取

import numpy


class NPDFeature():
    """Tool class that extracts NPD features from a grayscale image.

    The NPD value of two pixel intensities ``a`` and ``b`` is
    ``(a - b) / (a + b)``, defined as 0 when both are 0. One feature is
    produced for every unordered pixel pair ``(i, j)`` with ``i < j``.

    Attributes:
        image: The input image flattened to one dimension.
        n_pixels: An integer indicating the number of image total pixels.
        features: A one-dimension float ndarray of length
            ``n_pixels * (n_pixels - 1) // 2`` holding the extracted features.
    """
    # 256 x 256 lookup table shared by all instances; built on first use.
    __NPD_table__ = None

    def __init__(self, image):
        '''Initialize NPDFeature class with an image.

        Args:
            image: A numpy ndarray whose values are used as indices into the
                256-entry lookup table, so it must hold integers in [0, 255]
                (e.g. a uint8 grayscale image).
        '''
        if NPDFeature.__NPD_table__ is None:
            NPDFeature.__NPD_table__ = NPDFeature.__calculate_NPD_table()
        # Kept as an assert so callers still see AssertionError on bad input.
        assert isinstance(image, numpy.ndarray)
        self.image = image.ravel()
        self.n_pixels = image.size
        self.features = numpy.empty(shape=self.n_pixels * (self.n_pixels - 1) // 2, dtype=float)

    def extract(self):
        '''Extract features from given image.

        Returns:
            A one-dimension ndarray to store the extracted NPD features,
            ordered (0,1), (0,2), ..., (0,n-1), (1,2), ... over pixel pairs.
        '''
        # triu_indices(n, k=1) enumerates the pairs i < j in row-major order,
        # which is exactly the order of the nested "for i / for j > i" loops.
        first, second = numpy.triu_indices(self.n_pixels, k=1)
        # Fill in place so self.features keeps referring to the same array.
        self.features[:] = NPDFeature.__NPD_table__[self.image[first], self.image[second]]
        return self.features

    @staticmethod
    def __calculate_NPD_table():
        '''Calculate all situations table to accelerate feature extracting.

        Returns:
            A (256, 256) float ndarray with ``table[a][b] = (a - b) / (a + b)``
            and ``table[0][0] = 0``.
        '''
        print("Calculating the NPD table...")
        levels = numpy.arange(1 << 8, dtype=float)
        difference = levels[:, None] - levels[None, :]
        total = levels[:, None] + levels[None, :]
        table = numpy.zeros(shape=(1 << 8, 1 << 8), dtype=float)
        # total is zero only at [0, 0]; that cell keeps its initial 0.
        numpy.divide(difference, total, out=table, where=total != 0)
        return table

import numpy as np
import pandas as pd
import sklearn as sk
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import os
from PIL import Image
import matplotlib.pyplot as plt
import pickle
import cv2 as cv
import random
from feature import NPDFeature
from ensemble import AdaBoostClassifier

# Directories holding the raw face / non-face images read by dumpFeatures.
# NOTE(review): absolute, machine-specific Windows paths; each ends with a
# separator because dumpFeatures builds file names as ``path + name``.
facePath = 'F:\\VSCodeProject\\Python\\experiment\\ML2019-lab-03\\datasets\\original\\face\\'
nonfacePath = 'F:\\VSCodeProject\\Python\\experiment\\ML2019-lab-03\\datasets\\original\\nonface\\'

# Pickle files that cache the extracted features for each class.
face_data_file = 'F:\\VSCodeProject\\Python\\experiment\\ML2019-lab-03\\datasets\\face_data.txt'
nonface_data_file = 'F:\\VSCodeProject\\Python\\experiment\\ML2019-lab-03\\datasets\\nonface_data.txt'

def dumpFeatures(path, label = 1):
    """Extract NPD features for every image in a directory and pickle them.

    Each file in ``path`` is read as grayscale, resized to 24x24 and passed
    through ``NPDFeature``. The resulting list of feature vectors is pickled
    to ``face_data_file`` when ``label == 1``, otherwise to
    ``nonface_data_file``.

    Args:
        path: Directory containing the images (with or without a trailing
            separator).
        label: 1 for face images; any other value selects the non-face file.
    """
    file = face_data_file if label == 1 else nonface_data_file

    feat = []
    for name in os.listdir(path):
        # NOTE(review): cv.imread presumably returns None for files it cannot
        # decode, which makes cv.resize fail; the directory is expected to
        # contain images only.
        img_gray = cv.imread(os.path.join(path, name), cv.IMREAD_GRAYSCALE)
        img_resize = cv.resize(img_gray, (24, 24))
        feat.append(NPDFeature(img_resize).extract())

    # Open the output only after extraction succeeded, so a failure above does
    # not leave a truncated cache file, and close it deterministically.
    with open(file, 'wb') as write:
        pickle.dump(feat, write)

def loadFeatures(filename):
    """Load a pickled feature list and return it as a matrix.

    Args:
        filename: Path of a pickle file, e.g. one written by ``dumpFeatures``.

    Returns:
        ``np.matrix`` built from the unpickled object.
    """
    # NOTE: pickle can execute arbitrary code while loading; only use this on
    # files produced locally, never on untrusted input.
    with open(filename, 'rb') as read:
        data = pickle.load(file=read)
    return np.matrix(data)

def addLabel(data, label):
    """Prepend a constant label column to a two-dimensional sample matrix.

    Args:
        data: Samples, one per row; converted with ``np.matrix``.
        label: Value written into the new first column of every row.

    Returns:
        The samples with one extra leading column filled with ``label``.
    """
    samples = np.matrix(data)
    n_rows = samples.shape[0]
    label_column = np.full(shape=(n_rows, 1), fill_value=label)
    return np.concatenate((label_column, samples), axis=1)

def split_train_valid(data, fraction = 0.9):
    """Randomly split labelled data into training and validation parts.

    Args:
        data: Two-dimensional samples whose first column is the label and
            whose remaining columns are the features.
        fraction: Share of the rows assigned to the training part.

    Returns:
        The list produced by ``train_test_split`` called with the label column
        first and the feature columns second, i.e.
        ``[label_train, label_valid, feature_train, feature_valid]``.
    """
    # Import the submodule explicitly: ``import sklearn`` alone does not
    # guarantee that ``sklearn.model_selection`` is available as an attribute.
    from sklearn.model_selection import train_test_split

    data = np.matrix(data)
    labels = data[:, 0]
    features = data[:, 1:data.shape[1]]
    return train_test_split(labels, features, train_size=fraction, test_size=1-fraction)

# One-off preprocessing: uncomment these two calls to (re)build the pickled
# feature caches that are loaded below.
# dumpFeatures(facePath, label=1)
# dumpFeatures(nonfacePath, label=0)

# Import the splitter explicitly; ``import sklearn as sk`` alone does not
# guarantee that ``sk.model_selection`` is available as an attribute.
from sklearn.model_selection import train_test_split

face_data = loadFeatures(face_data_file)
nonface_data = loadFeatures(nonface_data_file)

# Column 0 becomes the label: 1 for faces, 0 for non-faces.
face_data = addLabel(face_data,1)
nonface_data = addLabel(nonface_data,0)

print(face_data.shape)
print(nonface_data.shape)

data = np.concatenate((face_data, nonface_data), axis=0)

fraction = 0.9
X_train, X_valid, y_train, y_valid = train_test_split(data[:,1:data.shape[1]],data[:,0],train_size=fraction, test_size=1-fraction)

X_train = np.matrix(X_train)
X_valid = np.matrix(X_valid)
y_train = np.matrix(y_train)
y_valid = np.matrix(y_valid)

print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)
print(np.sum(y_train==1), np.sum(y_valid==1))

# Uniform sample weights; only used by the single-tree baseline kept below
# (AdaBoostClassifier.fit creates its own weights).
weight = np.full(shape=(X_train.shape[0]), fill_value=1/X_train.shape[0])
weak_classifier = DecisionTreeClassifier()

# Single decision tree baseline, for comparison with the boosted model:
# weak_classifier.fit(X_train, y_train, sample_weight=weight)
# train_score = weak_classifier.score(X_train, y_train)
# valid_score = weak_classifier.score(X_valid, y_valid)

classifier = AdaBoostClassifier(weak_classifier, 10)
classifier.fit(X_train, y_train)
# predict_scores returns the accuracy on the given labelled set.
train_score = classifier.predict_scores(X_train, y_train)
valid_score = classifier.predict_scores(X_valid, y_valid)

print('{}, {}'.format(train_score, valid_score))

hx = classifier.predict(X_valid)
print(hx)
report_file = 'F:\\VSCodeProject\\Python\\experiment\\ML2019-lab-03\\report.txt'
report = classification_report(y_valid, hx)
print(report)
# Persist the report as well; the file used to be opened but was never
# written to or closed.
with open(report_file, 'w') as file_write:
    file_write.write(report)

加性融合

import pickle
import numpy as np
from sklearn.tree import DecisionTreeClassifier


class AdaBoostClassifier:
    '''A simple binary AdaBoost classifier built on small decision trees.

    The label ``1`` is the positive class; any other label value (``0`` or
    ``-1``) is treated as negative. Predictions are always reported as 1 / 0.

    Attributes:
        weak_classifier_list: Fitted weak classifiers that were accepted.
        classifier_weight: The weight (alpha) of each accepted classifier.
        n_weakers_limit: Number of boosting rounds run by ``fit``.
        Hx: (n_samples, n_weakers_limit) array filled by ``fit``; column
            ``t`` holds ``alpha_t * h_t(X)`` for an accepted round and zeros
            for a rejected one.
        pred_list: Training-set predictions of each accepted classifier.
    '''

    # Lower bound on the weighted error so that a perfect weak classifier
    # yields a large finite alpha instead of a division by zero.
    _MIN_ERROR = 1e-10

    def __init__(self, weak_classifier, n_weakers_limit):
        '''Initialize AdaBoostClassifier

        Args:
            weak_classifier: Accepted for interface compatibility only; it is
                not used, because ``fit`` always builds its own
                sklearn.tree.DecisionTreeClassifier for every round.
            n_weakers_limit: The maximum number of weak classifier the model can use.
        '''
        self.weak_classifier_list = []
        self.classifier_weight = []
        self.n_weakers_limit = n_weakers_limit
        self.Hx = []
        self.pred_list = []

    def is_good_enough(self):
        '''Optional hook; currently does nothing.'''
        pass

    def fit(self, X, y):
        '''Build a boosted classifier from the training set (X, y).

        Any model state left by a previous call is discarded first.

        Args:
            X: An ndarray indicating the samples to be trained, which shape should be (n_samples,n_features).
            y: An ndarray indicating the ground-truth labels correspond to X, which shape should be (n_samples,1).
        '''
        # Plain ndarrays: np.matrix inputs are flattened/converted here so
        # that the element-wise arithmetic below stays one-dimensional.
        X = np.asarray(X)
        labels = np.asarray(y).ravel()
        n_samples = labels.shape[0]

        # Start from scratch so that calling fit twice does not accumulate
        # weak classifiers from the earlier run.
        self.weak_classifier_list = []
        self.classifier_weight = []
        self.pred_list = []
        self.Hx = np.zeros((n_samples, self.n_weakers_limit))

        weight = np.full(shape=n_samples, fill_value=1 / n_samples)
        for iters in range(self.n_weakers_limit):

            weak_classifier = DecisionTreeClassifier(
                criterion='entropy',
                splitter='random',
                max_features='log2',
                max_depth=10,
                max_leaf_nodes=10,
                min_samples_split=10,
                min_samples_leaf=3,
                class_weight='balanced'
            )

            weak_classifier.fit(X, labels, sample_weight=weight)
            hx = np.asarray(weak_classifier.predict(X)).ravel()
            correct = hx == labels
            score = np.mean(correct)

            print('score {}: '.format(iters+1), score)

            # AdaBoost uses the error under the current sample weights, not
            # the plain (unweighted) misclassification rate.
            error = float(np.sum(weight[~correct]))
            if error >= 0.5:
                # No better than chance on the weighted set: discard it and
                # let the next (randomised) tree try with the same weights.
                continue
            error = max(error, self._MIN_ERROR)
            alpha = 0.5 * np.log((1 - error) / error)

            # exp(-alpha) for correctly classified samples, exp(+alpha) for
            # mistakes. Using agreement instead of the raw label product
            # makes the update valid for 0/1 labels as well as -1/+1.
            weight = weight * np.exp(np.where(correct, -alpha, alpha))
            weight = weight / np.sum(weight)

            self.classifier_weight.append(alpha)
            self.weak_classifier_list.append(weak_classifier)
            self.Hx[:, iters] = alpha * hx
            self.pred_list.append(hx)

    def _weighted_votes(self, X):
        '''Return the signed, alpha-weighted vote total for every sample.'''
        X = np.asarray(X)
        votes = np.zeros(X.shape[0])
        for alpha, weak_classifier in zip(self.classifier_weight, self.weak_classifier_list):
            hx = np.asarray(weak_classifier.predict(X)).ravel()
            # +alpha for a positive (label 1) vote, -alpha for anything else.
            votes += alpha * np.where(hx == 1, 1.0, -1.0)
        return votes

    def predict_scores(self, X, y):
        '''Calculate the accuracy of the boosted classifier on labelled samples.

        Despite its name, this method returns a single accuracy value rather
        than per-sample scores.

        Args:
            X: An ndarray indicating the samples to be predicted, which shape should be (n_samples,n_features).
            y: The ground-truth labels of X (1 for positive), with n_samples entries.

        Returns:
            The fraction of samples whose prediction from ``predict`` equals
            the corresponding label.
        '''
        truth = np.asarray(y).ravel()
        return np.mean(self.predict(X) == truth)

    def predict(self, X, threshold=0):
        '''Predict the catagories for given samples.

        Args:
            X: An ndarray indicating the samples to be predicted, which shape should be (n_samples,n_features).
            threshold: The demarcation number of deviding the samples into two parts;
                a sample is positive when its weighted vote total is strictly
                greater than this value.

        Returns:
            A one-dimension float ndarray with n_samples entries, 1 for
            positive and 0 for negative predictions.
        '''
        return np.where(self._weighted_votes(X) > threshold, 1.0, 0.0)

    @staticmethod
    def save(model, filename):
        '''Pickle ``model`` to the file ``filename``.'''
        with open(filename, "wb") as f:
            pickle.dump(model, f)

    @staticmethod
    def load(filename):
        '''Load a pickled model from ``filename``.

        NOTE: pickle can execute arbitrary code while loading; only load
        files from a trusted source.
        '''
        with open(filename, "rb") as f:
            return pickle.load(f)

训练

1
2
3

# Entry-point guard for running the training file as a script; the body is
# still a placeholder and does nothing.
if __name__ == "__main__":
    # write your code here
    pass

准确率

Author: WJZheng

Link: https://wellenzheng.github.io/2020/04/13/%E5%9F%BA%E4%BA%8EAdaBoost%E7%9A%84%E4%BA%BA%E8%84%B8%E8%AF%86%E5%88%AB%E7%AE%97%E6%B3%95/

Machine Learning

Recommend

Comment