avatar

基于AdaBoost的人脸识别算法

AdaBoost简介


AdaBoost 是 Boosting 家族中最著名的算法,它能够将多个弱学习器(也称为基学习器)组合成一个强学习器。AdaBoost 算法有多种推导方式,其中最简单的一种基于加性模型,即弱学习器的线性组合。

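为了最小化指数损失函数 $\ell_{\exp}(H) = \mathbb{E}\left[e^{-y H(x)}\right]$(其中 $y \in \{-1, +1\}$,$H(x) = \sum_{t} \alpha_t h_t(x)$ 为弱学习器的加权和),我们有第 $t$ 个弱学习器的权重 $\alpha_t = \frac{1}{2} \ln \frac{1 - \epsilon_t}{\epsilon_t}$,其中 $\epsilon_t$ 是 $h_t$ 在当前样本权重下的错误率。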

以下是AdaBoost的算法流程:

代码示例


特征提取

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import numpy


class NPDFeature():
    """Extract NPD features from a grayscale image.

    For every unordered pixel pair (a, b) of the flattened image the feature
    is (a - b) / (a + b), defined as 0 when both pixels are 0. All 256 x 256
    possible values are precomputed once and shared by every instance.

    Attributes:
        image: The input image flattened to one dimension.
        n_pixels: An integer indicating the number of image total pixels.
        features: A one-dimension ndarray to store the extracted NPD features,
            of length n_pixels * (n_pixels - 1) / 2.
    """
    # Lookup table shared by all instances; built lazily by the first __init__.
    __NPD_table__ = None

    def __init__(self, image):
        """Initialize NPDFeature class with an image.

        Args:
            image: A numpy.ndarray of integer pixel values in [0, 255]; the
                values are used directly as indices into the lookup table.
        """
        if NPDFeature.__NPD_table__ is None:
            NPDFeature.__NPD_table__ = NPDFeature.__calculate_NPD_table()
        assert isinstance(image, numpy.ndarray)
        self.image = image.ravel()
        self.n_pixels = image.size
        self.features = numpy.empty(
            shape=self.n_pixels * (self.n_pixels - 1) // 2, dtype=float)

    def extract(self):
        """Extract features from given image.

        Pixel pairs are visited in the order (0, 1), (0, 2), ..., (0, n - 1),
        (1, 2), ...; this is the row-major order of the strict upper triangle,
        which is exactly what numpy.triu_indices(n, k=1) yields.

        Returns:
            A one-dimension ndarray to store the extracted NPD features
            (the same array object as ``self.features``).
        """
        first, second = numpy.triu_indices(self.n_pixels, k=1)
        # One vectorized table lookup replaces the Python-level double loop.
        self.features[:] = NPDFeature.__NPD_table__[
            self.image[first], self.image[second]]
        return self.features

    @staticmethod
    def __calculate_NPD_table():
        """Calculate all situations table to accelerate feature extracting.

        Returns:
            A (256, 256) float ndarray with table[i][j] = (i - j) / (i + j),
            and table[0][0] = 0.
        """
        print("Calculating the NPD table...")
        levels = numpy.arange(1 << 8, dtype=float)
        difference = levels[:, None] - levels[None, :]
        total = levels[:, None] + levels[None, :]
        table = numpy.zeros(shape=(1 << 8, 1 << 8), dtype=float)
        # Only the (0, 0) entry has a zero denominator; it keeps the 0 default.
        numpy.divide(difference, total, out=table, where=total != 0)
        return table
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import os
import pickle
import random

import cv2 as cv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn as sk
from PIL import Image
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from ensemble import AdaBoostClassifier
from feature import NPDFeature

# Directories whose image files are listed and read by dumpFeatures.
facePath = 'F:\\VSCodeProject\\Python\\experiment\\ML2019-lab-03\\datasets\\original\\face\\'
nonfacePath = 'F:\\VSCodeProject\\Python\\experiment\\ML2019-lab-03\\datasets\\original\\nonface\\'

# Pickle files that dumpFeatures writes and loadFeatures reads back
# (despite the .txt suffix they are opened in binary mode).
face_data_file = 'F:\\VSCodeProject\\Python\\experiment\\ML2019-lab-03\\datasets\\face_data.txt'
nonface_data_file = 'F:\\VSCodeProject\\Python\\experiment\\ML2019-lab-03\\datasets\\nonface_data.txt'

def dumpFeatures(path, label = 1):
    """Extract NPD features for every image in a directory and pickle them.

    Each file listed in ``path`` is read as grayscale, resized to 24x24 and
    passed through NPDFeature; the list of feature vectors is pickled to
    ``face_data_file`` when ``label`` is 1, otherwise to ``nonface_data_file``.

    Args:
        path: Directory containing the images.
        label: 1 selects the face output file; any other value selects the
            non-face output file.
    """
    target_file = face_data_file if label == 1 else nonface_data_file
    feat = []
    for name in os.listdir(path):
        # os.path.join works whether or not ``path`` ends with a separator.
        img_gray = cv.imread(os.path.join(path, name), cv.IMREAD_GRAYSCALE)
        img_resize = cv.resize(img_gray, (24, 24))
        feat.append(NPDFeature(img_resize).extract())
    # Open the output only after extraction succeeded, so a failure part-way
    # through does not leave a truncated file, and always close the handle.
    with open(target_file, 'wb') as write:
        pickle.dump(feat, write)

def loadFeatures(filename):
    """Load a pickled feature list and return it as a matrix.

    Args:
        filename: Path of a pickle file such as the ones dumpFeatures writes.

    Returns:
        An np.matrix built from the unpickled object.
    """
    # NOTE: pickle.load can execute arbitrary code; only load files that this
    # project produced itself.
    with open(filename, 'rb') as read:
        data = pickle.load(file=read)
    return np.matrix(data)

def addLabel(data, label):
    """Prepend a constant label column to a 2-D data set.

    Args:
        data: Anything np.matrix accepts; one sample per row.
        label: The value written into the new first column of every row.

    Returns:
        The data with one extra leading column holding ``label``.
    """
    samples = np.matrix(data)
    n_rows = samples.shape[0]
    label_column = np.full(shape=(n_rows, 1), fill_value=label)
    return np.concatenate((label_column, samples), axis=1)

def split_train_valid(data, fraction = 0.9):
    """Randomly split a labelled data set into training and validation parts.

    Column 0 of ``data`` is treated as the label and the remaining columns as
    the features.

    Args:
        data: 2-D data whose first column holds the labels.
        fraction: Share of the rows assigned to the training part.

    Returns:
        What train_test_split returns for (labels, features), i.e. the label
        splits come first: (y_train, y_valid, X_train, X_valid).
    """
    data = np.matrix(data)
    labels = data[:, 0]
    features = data[:, 1:data.shape[1]]
    # Use the explicitly imported function rather than reaching through
    # ``sk.model_selection``, which only exists as an attribute if some other
    # import happened to load that submodule.
    return train_test_split(labels, features,
                            train_size=fraction, test_size=1 - fraction)

# Feature extraction is slow; uncomment and run these once to (re)build the
# pickled feature files that are loaded below.
# dumpFeatures(facePath, label=1)
# dumpFeatures(nonfacePath, label=0)

face_data = loadFeatures(face_data_file)
nonface_data = loadFeatures(nonface_data_file)

# Column 0 becomes the label: 1 for faces, 0 for non-faces.
face_data = addLabel(face_data, 1)
nonface_data = addLabel(nonface_data, 0)

print(face_data.shape)
print(nonface_data.shape)

# Work on plain ndarrays from here on: recent scikit-learn releases reject
# np.matrix inputs.
data = np.asarray(np.concatenate((face_data, nonface_data), axis=0))

fraction = 0.9
X_train, X_valid, y_train, y_valid = train_test_split(
    data[:, 1:], data[:, 0], train_size=fraction, test_size=1 - fraction)

print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)
print(np.sum(y_train == 1), np.sum(y_valid == 1))

weak_classifier = DecisionTreeClassifier()

classifier = AdaBoostClassifier(weak_classifier, 10)
classifier.fit(X_train, y_train)
train_score = classifier.predict_scores(X_train, y_train)
valid_score = classifier.predict_scores(X_valid, y_valid)

print('{}, {}'.format(train_score, valid_score))

hx = classifier.predict(X_valid)
print(hx)
report_file = 'F:\\VSCodeProject\\Python\\experiment\\ML2019-lab-03\\report.txt'
report = classification_report(y_valid, hx)
print(report)
# Persist the report; previously the file was opened (and truncated) but
# nothing was ever written to it and the handle was never closed.
with open(report_file, 'w', encoding='utf-8') as file_write:
    file_write.write(report)

加性融合

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import pickle
import numpy as np
from sklearn.tree import DecisionTreeClassifier


class AdaBoostClassifier:
    '''A simple AdaBoost classifier for binary labels.

    Labels and weak-learner predictions greater than 0 are treated as the
    positive class and everything else as the negative class, so both {0, 1}
    and {-1, +1} label encodings work. Predictions are returned as 1 / 0.

    Attributes:
        weak_classifier_list: The fitted weak learners that were kept.
        classifier_weight: The alpha of each kept learner, in the same order.
        n_weakers_limit: The maximum number of boosting rounds.
        Hx: After fit, an (n_samples, n_weakers_limit) array whose column t
            holds alpha_t times the signed training prediction of round t
            (all zeros for rounds whose learner was discarded).
        pred_list: Raw training predictions of each kept learner.
    '''

    def __init__(self, weak_classifier, n_weakers_limit):
        '''Initialize AdaBoostClassifier

        Args:
            weak_classifier: Accepted for interface compatibility but not
                used; fit always builds its own constrained
                DecisionTreeClassifier.
            n_weakers_limit: The maximum number of weak classifier the model can use.
        '''
        self.weak_classifier_list = []
        self.classifier_weight = []
        self.n_weakers_limit = n_weakers_limit
        self.Hx = []
        self.pred_list = []

    def is_good_enough(self):
        '''Optional'''
        pass

    @staticmethod
    def _to_signed(values):
        '''Flatten labels/predictions and map them to +1 (value > 0) or -1.'''
        return np.where(np.asarray(values).ravel() > 0, 1.0, -1.0)

    def _weighted_votes(self, X):
        '''Return, per sample, the sum of alpha * signed prediction over all kept learners.'''
        features = np.asarray(X)
        votes = np.zeros(features.shape[0])
        for alpha, learner in zip(self.classifier_weight, self.weak_classifier_list):
            votes += alpha * self._to_signed(learner.predict(features))
        return votes

    def fit(self, X, y):
        '''Build a boosted classifier from the training set (X, y).

        Any learners from a previous call are discarded first.

        Args:
            X: An ndarray indicating the samples to be trained, which shape should be (n_samples,n_features).
            y: The ground-truth labels corresponding to X, of shape
                (n_samples,) or (n_samples,1).
        '''
        features = np.asarray(X)
        labels = np.asarray(y).ravel()
        signed_labels = self._to_signed(labels)
        n_samples = labels.shape[0]

        # Start from a clean state so that calling fit twice cannot leave more
        # learners than n_weakers_limit.
        self.weak_classifier_list = []
        self.classifier_weight = []
        self.pred_list = []
        self.Hx = np.zeros((n_samples, self.n_weakers_limit))

        weight = np.full(shape=(n_samples), fill_value=1/n_samples)
        print(weight.shape)
        min_error = 1e-10  # keeps alpha finite when a learner is perfect

        for iters in range(self.n_weakers_limit):

            weak_classifier = DecisionTreeClassifier(
                criterion='entropy',
                splitter='random',
                max_features='log2',
                max_depth=10,
                max_leaf_nodes=10,
                min_samples_split=10,
                min_samples_leaf=3,
                class_weight='balanced'
            )

            weak_classifier.fit(features, labels, sample_weight=weight)
            hx = weak_classifier.predict(features)
            score = weak_classifier.score(features, labels)

            print('score {}: '.format(iters+1), score)

            signed_hx = self._to_signed(hx)
            # AdaBoost's error is the total *weight* of the misclassified
            # samples, not the unweighted error rate.
            error = float(np.sum(weight[signed_hx != signed_labels]))
            if error >= 0.5:
                # No better than chance on the current distribution: discard
                # it and leave the weights untouched. The random splitter
                # makes the next round try a different tree.
                continue

            error = max(error, min_error)
            alpha = 0.5 * np.log((1-error)/error)

            # With +/-1 encoding the product is +1 for correct and -1 for
            # wrong predictions, so mistakes gain weight and hits lose it.
            weight = weight * np.exp(-alpha * signed_labels * signed_hx)
            weight = weight / np.sum(weight)

            self.classifier_weight.append(alpha)
            self.weak_classifier_list.append(weak_classifier)
            self.Hx[:,iters] = alpha*signed_hx
            self.pred_list.append(np.array(hx))

    def predict_scores(self, X, y):
        '''Calculate the accuracy of the boosted classifier on labelled samples.

        Args:
            X: An ndarray indicating the samples to be predicted, which shape should be (n_samples,n_features).
            y: The ground-truth labels for X, of shape (n_samples,) or
                (n_samples,1).

        Returns:
            The fraction of samples whose predicted class matches y.
        '''
        predicted_positive = self.predict(X) > 0
        actual_positive = self._to_signed(y) > 0
        return np.mean(predicted_positive == actual_positive)

    def predict(self, X, threshold=0):
        '''Predict the categories for given samples.

        Args:
            X: An ndarray indicating the samples to be predicted, which shape should be (n_samples,n_features).
            threshold: A sample is labelled 1 when its signed weighted vote
                (sum of alpha * (+1 or -1) over the learners) is strictly
                greater than this value; 0 means a weighted majority vote.

        Returns:
            A one-dimension float ndarray of 1 / 0 labels, of shape (n_samples,).
        '''
        return np.where(self._weighted_votes(X) > threshold, 1.0, 0.0)

    @staticmethod
    def save(model, filename):
        '''Pickle ``model`` to ``filename``.'''
        with open(filename, "wb") as f:
            pickle.dump(model, f)

    @staticmethod
    def load(filename):
        '''Unpickle and return a model previously written by save.

        NOTE: pickle.load can execute arbitrary code; only load trusted files.
        '''
        with open(filename, "rb") as f:
            return pickle.load(f)

训练

1
2
3
if __name__ == "__main__":
    # Placeholder entry point: nothing is executed yet when this file is run
    # as a script.
    pass

准确率

Author: WJZheng
Link: https://wellenzheng.github.io/2020/04/13/%E5%9F%BA%E4%BA%8EAdaBoost%E7%9A%84%E4%BA%BA%E8%84%B8%E8%AF%86%E5%88%AB%E7%AE%97%E6%B3%95/
Copyright Notice: All articles in this blog are licensed under CC BY-NC-SA 4.0 unless stating additionally.

Comment