1️⃣ Task requirements
1️⃣.1️⃣ Implement five functions:
- load_data(): read the data and convert it into a usable format;
- split_data(): split the dataset into a training set and a test set;
- train(): train a model on the training set;
- predict(): use the model produced by train() to assign the students in the test set to classes;
- evaluate(): report the accuracy of the model.
1️⃣.2️⃣ train() and predict() must not use third-party libraries;
1️⃣.3️⃣ Dataset (download link: student.csv)
- each row describes one student and the class that student was finally assigned to;
- 649 rows of data (instances);
- 30 categorical attributes;
- 6 classes: {A+, A, B, C, D, F};
1️⃣.4️⃣ Dataset description:
1 school - student's school (binary: "GP" - Gabriel Pereira or "MS" - Mousinho da Silveira)
2 sex - student's sex (binary: "F" - female or "M" - male)
3 address - student's home address type (binary: "U" - urban or "R" - rural)
4 famsize - family size (binary: "LE3" - less or equal to 3 or "GT3" - greater than 3)
5 Pstatus - parents' cohabitation status (binary: "T" - living together or "A" - apart)
6 Medu - mother's education (nominal: low, none, mid, high)
7 Fedu - father's education (nominal: low, none, mid, high)
8 Mjob - mother's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or "other")
9 Fjob - father's job (nominal: "teacher", "health" care related, civil "services" (e.g. administrative or police), "at_home" or "other")
10 reason - reason to choose this school (nominal: close to "home", school "reputation", "course" preference or "other")
11 guardian - student's guardian (nominal: "mother", "father" or "other")
12 traveltime - home to school travel time (nominal: none, low, medium, high, very_high)
13 studytime - weekly study time (nominal: none, low, medium, high, very_high)
14 failures - number of past class failures (nominal: none, low, medium, high, very_high)
15 schoolsup - extra educational support (binary: yes or no)
16 famsup - family educational support (binary: yes or no)
17 paid - extra paid classes within the course subject (binary: yes or no)
18 activities - extra-curricular activities (binary: yes or no)
19 nursery - attended nursery school (binary: yes or no)
20 higher - wants to take higher education (binary: yes or no)
21 internet - Internet access at home (binary: yes or no)
22 romantic - in a romantic relationship (binary: yes or no)
23 famrel - quality of family relationships (nominal: very_bad, bad, mediocre, good, excellent)
24 freetime - free time after school (nominal: very_low, low, mediocre, high, very_high)
25 goout - going out with friends (nominal: very_low, low, mediocre, high, very_high)
26 Dalc - workday alcohol consumption (nominal: very_low, low, mediocre, high, very_high)
27 Walc - weekend alcohol consumption (nominal: very_low, low, mediocre, high, very_high)
28 health - current health status (nominal: very_bad, bad, mediocre, good, excellent)
29 absences - number of school absences (nominal: none, one_to_three, four_to_six, seven_to_ten, more_than_ten)
30 Grade - final grade (A+, A, B, C, D, F)
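To double-check these encodings against the actual file, here is a quick inspection sketch (assuming student.csv sits in the working directory):

import pandas as pd

data = pd.read_csv('student.csv')
for col in data.columns:
    # list each attribute together with the values it actually takes
    print(col, sorted(data[col].unique()))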
2️⃣ Code
2️⃣.1️⃣ load_data()
# This function should open a data file in csv, and transform it into a usable format
def load_data():
    import pandas as pd
    data = pd.read_csv('student.csv', sep=',')
    return data
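A quick sanity check on the loaded frame; the expected shape follows from the 649 instances and 30 attributes listed above (assuming the CSV header matches the attribute list):

data = load_data()
print(data.shape)        # expected: (649, 30)
print(data.columns[-1])  # 'Grade', the label column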
2️⃣.2️⃣ split_data()
# This function should split a data set into a training set and hold-out test set
def split_data(data, train_size):
    """
    Split the data into a training set and a test set.
    :param data: DataFrame returned by load_data()
    :param train_size: float in (0, 1), proportion of rows used for training
    :return: X_train, X_test, y_train, y_test
    """
    import numpy as np
    X = data[list(data.columns[:-1])].values  # feature matrix: every column except the label
    y = data['Grade'].values                  # class vector as a plain ndarray
    index = np.arange(data.shape[0])          # row indices 0..N-1
    np.random.shuffle(index)                  # shuffle the order of the rows
    X = X[index]                              # reorder the feature matrix
    y = y[index]                              # reorder the class vector
    split_point = int(X.shape[0] * train_size)  # the first split_point rows form the training set
    X_train, X_test = X[:split_point], X[split_point:]
    y_train, y_test = y[:split_point], y[split_point:]
    return X_train, X_test, y_train, y_test
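As a usage sketch: with a train proportion of 0.7 on the 649-row dataset, the split point lands at int(649 * 0.7) = 454, leaving 195 rows for the test set (29 feature columns, since Grade is held out):

X_train, X_test, y_train, y_test = split_data(data, 0.7)
print(X_train.shape, X_test.shape)  # expected: (454, 29) (195, 29)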
2️⃣.3️⃣ train()
# This function should build a supervised NB model
def train(X, y, alpha):
    """
    Estimate the probability tables of a categorical Naive Bayes classifier.
    :param X: ndarray, training instances
    :param y: ndarray, training labels
    :param alpha: Laplace smoothing parameter
    :return: y_class_tuple, prior_prob, feature_value, feature_value_number, conditional_prob
    """
    y_class_count = {}
    feature_dimension = X.shape[1]  # number of features
    # count the instances of each label
    for c in y:
        y_class_count[c] = y_class_count.get(c, 0) + 1  # class counts, e.g. {'A': 69, ...}
    y_class_tuple = sorted(y_class_count.items())  # (label, count) pairs sorted by label, e.g. [('A', 69), ...]
    K = len(y_class_tuple)  # number of distinct classes
    N = len(y)              # number of training instances
    # prior probability with Laplace smoothing: P(c) = (count(c) + alpha) / (N + K * alpha)
    prior_prob = {}
    for key in range(len(y_class_tuple)):
        prior_prob[y_class_tuple[key][0]] = (y_class_tuple[key][1] + alpha) / (N + K * alpha)
    # collect the set of observed values of each feature
    feature_value = []         # unique values of each feature
    feature_value_number = []  # number of unique values of each feature
    for feature in range(feature_dimension):
        unique_feature = list(set(X[:, feature]))  # use `set` to get the unique values
        feature_value_number.append(len(unique_feature))
        feature_value.append(unique_feature)
    # conditional probability tables, one per feature
    conditional_prob = []
    y_list = list(y)  # materialise y once instead of rebuilding it inside the loops
    for j in range(feature_dimension):
        # zero matrix with feature_value_number[j] rows and K columns
        count = [[0 for _ in range(K)] for _ in range(feature_value_number[j])]
        # count the instances with x_j = a and y = c
        for i in range(len(X)):
            k = feature_value[j].index(X[i, j])
            for t in range(K):
                if y_list[i] == y_class_tuple[t][0]:
                    count[k][t] += 1
        # turn the counts into smoothed probabilities:
        # P(x_j = a | c) = (count(a, c) + alpha) / (count(c) + alpha * V_j)
        for m in range(K):
            for r in range(len(count)):
                count[r][m] = (count[r][m] + alpha) / (y_class_tuple[m][1] + alpha * feature_value_number[j])
        conditional_prob.append(count)
    return y_class_tuple, prior_prob, feature_value, feature_value_number, conditional_prob
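The returned structures can be inspected directly: conditional_prob[j][k][t] is the smoothed estimate of P(x_j = feature_value[j][k] | class t), so each class column of a table sums to 1 over the feature's values. A minimal sketch:

y_class_tuple, prior_prob, feature_value, feature_value_number, conditional_prob = train(X_train, y_train, 1)
print(prior_prob)            # smoothed prior of each class
print(feature_value_number)  # number of distinct values per feature
# Laplace smoothing guarantees the probabilities of feature 0 sum to 1 for class 0
print(sum(conditional_prob[0][k][0] for k in range(feature_value_number[0])))  # ~1.0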
2️⃣.4️⃣ predict()
def classify(y_class_tuple, prior_prob, feature_value, conditional_prob, feature_value_number, alpha, instance):
    """
    Compute the log-posterior score of each class for one instance.
    :param y_class_tuple: list of (label, count) pairs sorted by label
    :param prior_prob: dict, prior probability of each class
    :param feature_value: list, observed values of every attribute
    :param conditional_prob: list, conditional probability tables from train()
    :param feature_value_number: list, number of unique values of each feature
    :param alpha: float, Laplace smoothing parameter
    :param instance: one row of the test set
    :return: dict mapping each class label to its log-posterior score
    """
    import math
    log_prob = {}
    for m in range(len(y_class_tuple)):
        # start from the log prior of the m-th class;
        # working in log space avoids floating-point underflow
        yhat = math.log(prior_prob[y_class_tuple[m][0]])
        for n in range(len(instance)):
            if instance[n] in feature_value[n]:
                index = feature_value[n].index(instance[n])  # locate the value in feature_value
                yhat = yhat + math.log(conditional_prob[n][index][m])  # accumulate the log-likelihood
            else:
                # value never seen in training: use the Laplace-smoothed probability of a zero count
                yhat = yhat + math.log(alpha / (y_class_tuple[m][1] + alpha * feature_value_number[n]))
        log_prob[y_class_tuple[m][0]] = yhat
    return log_prob
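classify() scores one instance at a time; for a single test row the predicted class is just the argmax of the returned dict, for example:

scores = classify(y_class_tuple, prior_prob, feature_value, conditional_prob, feature_value_number, 1, X_test[0, :])
print(max(scores, key=scores.get))  # the class with the highest log-posterior score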
# This function should predict the class for an instance or a set of instances, based on a trained model
def predict(y_class_tuple, prior_prob, feature_value, feature_value_number, conditional_prob, X, alpha, flag=0):
    """
    Predict the class for an instance or a set of instances, based on a trained model.
    :param y_class_tuple: list of (label, count) pairs sorted by label
    :param prior_prob: dict, prior probability of each class
    :param feature_value: list, observed values of every attribute
    :param feature_value_number: list, number of unique values of each feature
    :param conditional_prob: list, conditional probability tables from train()
    :param X: ndarray, test instances
    :param alpha: float, Laplace smoothing parameter
    :param flag: 1 to return the winning log-probabilities, 0 to return the predicted labels (default 0)
    :return: list of predicted labels, or list of log-probabilities if flag is 1
    """
    import operator as op
    test_num = len(X)
    prediction = [None] * test_num
    probability = [None] * test_num
    for i in range(test_num):
        result = classify(y_class_tuple, prior_prob, feature_value, conditional_prob, feature_value_number, alpha, X[i, :])
        # result maps each class to its log-posterior score
        result = sorted(result.items(), key=op.itemgetter(1), reverse=True)  # highest score first
        prediction[i] = result[0][0]   # predicted label
        probability[i] = result[0][1]  # log-probability of the predicted label
    if flag:
        return probability
    else:
        return prediction
2️⃣.5️⃣ evaluate()
# This function should evaluate a set of predictions in terms of accuracy
def evaluate(p, y_test):
    # element-wise comparison works because y_test is an ndarray
    accuracy = sum(p == y_test) / len(y_test)
    return accuracy
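A tiny self-contained check with made-up labels (hypothetical values, only to illustrate the element-wise comparison):

import numpy as np
print(evaluate(['A', 'B', 'C'], np.array(['A', 'B', 'F'])))  # 2 of 3 correct -> 0.666...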
2️⃣.6️⃣ Main
data = load_data()
X_train, X_test, y_train, y_test = split_data(data, 0.7)  # 70% train, 30% hold-out test
y_class_tuple, prior_prob, feature_value, feature_value_number, conditional_prob = train(X_train, y_train, 1)
p = predict(y_class_tuple, prior_prob, feature_value, feature_value_number, conditional_prob, X_test, 1)
print(evaluate(p, y_test))
3️⃣ Full code (so you can copy and run it directly)
# This function should open a data file in csv, and transform it into a usable format
def load_data():
    import pandas as pd
    data = pd.read_csv('student.csv', sep=',')
    return data
# This function should split a data set into a training set and hold-out test set
def split_data(data, train_size):
    """
    Split the data into a training set and a test set.
    :param data: DataFrame returned by load_data()
    :param train_size: float in (0, 1), proportion of rows used for training
    :return: X_train, X_test, y_train, y_test
    """
    import numpy as np
    X = data[list(data.columns[:-1])].values  # feature matrix: every column except the label
    y = data['Grade'].values                  # class vector as a plain ndarray
    index = np.arange(data.shape[0])          # row indices 0..N-1
    np.random.shuffle(index)                  # shuffle the order of the rows
    X = X[index]                              # reorder the feature matrix
    y = y[index]                              # reorder the class vector
    split_point = int(X.shape[0] * train_size)  # the first split_point rows form the training set
    X_train, X_test = X[:split_point], X[split_point:]
    y_train, y_test = y[:split_point], y[split_point:]
    return X_train, X_test, y_train, y_test
# This function should build a supervised NB model
def train(X, y, alpha):
    """
    Estimate the probability tables of a categorical Naive Bayes classifier.
    :param X: ndarray, training instances
    :param y: ndarray, training labels
    :param alpha: Laplace smoothing parameter
    :return: y_class_tuple, prior_prob, feature_value, feature_value_number, conditional_prob
    """
    y_class_count = {}
    feature_dimension = X.shape[1]  # number of features
    # count the instances of each label
    for c in y:
        y_class_count[c] = y_class_count.get(c, 0) + 1  # class counts, e.g. {'A': 69, ...}
    y_class_tuple = sorted(y_class_count.items())  # (label, count) pairs sorted by label, e.g. [('A', 69), ...]
    K = len(y_class_tuple)  # number of distinct classes
    N = len(y)              # number of training instances
    # prior probability with Laplace smoothing: P(c) = (count(c) + alpha) / (N + K * alpha)
    prior_prob = {}
    for key in range(len(y_class_tuple)):
        prior_prob[y_class_tuple[key][0]] = (y_class_tuple[key][1] + alpha) / (N + K * alpha)
    # collect the set of observed values of each feature
    feature_value = []         # unique values of each feature
    feature_value_number = []  # number of unique values of each feature
    for feature in range(feature_dimension):
        unique_feature = list(set(X[:, feature]))  # use `set` to get the unique values
        feature_value_number.append(len(unique_feature))
        feature_value.append(unique_feature)
    # conditional probability tables, one per feature
    conditional_prob = []
    y_list = list(y)  # materialise y once instead of rebuilding it inside the loops
    for j in range(feature_dimension):
        # zero matrix with feature_value_number[j] rows and K columns
        count = [[0 for _ in range(K)] for _ in range(feature_value_number[j])]
        # count the instances with x_j = a and y = c
        for i in range(len(X)):
            k = feature_value[j].index(X[i, j])
            for t in range(K):
                if y_list[i] == y_class_tuple[t][0]:
                    count[k][t] += 1
        # turn the counts into smoothed probabilities:
        # P(x_j = a | c) = (count(a, c) + alpha) / (count(c) + alpha * V_j)
        for m in range(K):
            for r in range(len(count)):
                count[r][m] = (count[r][m] + alpha) / (y_class_tuple[m][1] + alpha * feature_value_number[j])
        conditional_prob.append(count)
    return y_class_tuple, prior_prob, feature_value, feature_value_number, conditional_prob
def classify(y_class_tuple, prior_prob, feature_value, conditional_prob, feature_value_number, alpha, instance):
    """
    Compute the log-posterior score of each class for one instance.
    :param y_class_tuple: list of (label, count) pairs sorted by label
    :param prior_prob: dict, prior probability of each class
    :param feature_value: list, observed values of every attribute
    :param conditional_prob: list, conditional probability tables from train()
    :param feature_value_number: list, number of unique values of each feature
    :param alpha: float, Laplace smoothing parameter
    :param instance: one row of the test set
    :return: dict mapping each class label to its log-posterior score
    """
    import math
    log_prob = {}
    for m in range(len(y_class_tuple)):
        # start from the log prior of the m-th class;
        # working in log space avoids floating-point underflow
        yhat = math.log(prior_prob[y_class_tuple[m][0]])
        for n in range(len(instance)):
            if instance[n] in feature_value[n]:
                index = feature_value[n].index(instance[n])  # locate the value in feature_value
                yhat = yhat + math.log(conditional_prob[n][index][m])  # accumulate the log-likelihood
            else:
                # value never seen in training: use the Laplace-smoothed probability of a zero count
                yhat = yhat + math.log(alpha / (y_class_tuple[m][1] + alpha * feature_value_number[n]))
        log_prob[y_class_tuple[m][0]] = yhat
    return log_prob
# This function should predict the class for an instance or a set of instances, based on a trained model
def predict(y_class_tuple, prior_prob, feature_value, feature_value_number, conditional_prob, X, alpha, flag=0):
    """
    Predict the class for an instance or a set of instances, based on a trained model.
    :param y_class_tuple: list of (label, count) pairs sorted by label
    :param prior_prob: dict, prior probability of each class
    :param feature_value: list, observed values of every attribute
    :param feature_value_number: list, number of unique values of each feature
    :param conditional_prob: list, conditional probability tables from train()
    :param X: ndarray, test instances
    :param alpha: float, Laplace smoothing parameter
    :param flag: 1 to return the winning log-probabilities, 0 to return the predicted labels (default 0)
    :return: list of predicted labels, or list of log-probabilities if flag is 1
    """
    import operator as op
    test_num = len(X)
    prediction = [None] * test_num
    probability = [None] * test_num
    for i in range(test_num):
        result = classify(y_class_tuple, prior_prob, feature_value, conditional_prob, feature_value_number, alpha, X[i, :])
        # result maps each class to its log-posterior score
        result = sorted(result.items(), key=op.itemgetter(1), reverse=True)  # highest score first
        prediction[i] = result[0][0]   # predicted label
        probability[i] = result[0][1]  # log-probability of the predicted label
    if flag:
        return probability
    else:
        return prediction
# This function should evaluate a set of predictions in terms of accuracy
def evaluate(p, y_test):
    # element-wise comparison works because y_test is an ndarray
    accuracy = sum(p == y_test) / len(y_test)
    return accuracy
data = load_data()
X_train, X_test, y_train, y_test = split_data(data, 0.7)  # 70% train, 30% hold-out test
y_class_tuple, prior_prob, feature_value, feature_value_number, conditional_prob = train(X_train, y_train, 1)
p = predict(y_class_tuple, prior_prob, feature_value, feature_value_number, conditional_prob, X_test, 1)
print(evaluate(p, y_test))
Output (varies between runs, because split_data() shuffles randomly): 0.4358974358974359