I present to you… my first classifier! Naive Bayes! It appears to work! haha :)
I know it’s a mess, but I’ve barely used Python before and I’m new to machine learning, so I’m learning as I go. This is for part #2 of my project.
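For anyone following along, the rule the code implements is Gaussian naive Bayes: for each point, pick the class c that maximizes the class prior times the product of per-feature normal densities. In LaTeX:

\hat{y} = \arg\max_{c \in \{0,1\}} \; p(c) \prod_{j=1}^{2} \frac{1}{\sqrt{2\pi\sigma_{c,j}^{2}}} \exp\!\left( -\frac{(x_j - \mu_{c,j})^{2}}{2\sigma_{c,j}^{2}} \right)

where \mu_{c,j} and \sigma_{c,j}^{2} are the per-class, per-column sample mean and variance estimated from the training set, and p(c) is the class frequency.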
import numpy as np
import random  # needed below for the coin-flip tie-breaker

# bring in data from files
data_train = np.loadtxt("train.txt")
data_test = np.loadtxt("test.txt")

# can't define the 3rd column as integer, or the array collapses to 1D
# and the column slicing below won't work?
Xtrn = data_train[:, 0:2]  # first 2 columns of training set
Ytrn = data_train[:, 2]    # last column, 1/0 labels
Xtst = data_test[:, 0:2]   # first 2 columns of test set
Ytst = data_test[:, 2]     # last column, 1/0 labels

print("Length of training set: %d " % len(Xtrn))
print("Length of test set: %d " % len(Xtst))

# mean and variance of each x in Class 0
# initialize
Xtrn_0_1 = []
Xtrn_0_2 = []
# loop through training data and find the items labeled 0
for train_items in data_train:
    if train_items[2] == 0:
        Xtrn_0_1.append(train_items[0])  # first column of training set
        Xtrn_0_2.append(train_items[1])  # second column of training set

print("\nNumber of Class 0 items in training set: %d " % len(Xtrn_0_1))
train0_col1_mean = np.mean(Xtrn_0_1)
train0_col2_mean = np.mean(Xtrn_0_2)
#train0_cov = np.cov(Xtrn_0_1, Xtrn_0_2)[0][1]

# calculating variance without numpy (don't need covariance for Naive Bayes)
#cov0 = sum0/(len(Xtrn_0_1)-1)
sum0_1 = 0
sum0_2 = 0
# add up (x - m)^2 for each column
for i in range(0, len(Xtrn_0_1)):
    sum0_1 += (Xtrn_0_1[i] - train0_col1_mean) * (Xtrn_0_1[i] - train0_col1_mean)  # col1
    sum0_2 += (Xtrn_0_2[i] - train0_col2_mean) * (Xtrn_0_2[i] - train0_col2_mean)  # col2

# sample variance of each column, (n - 1) denominator
var0_1 = sum0_1 / (len(Xtrn_0_1) - 1)
var0_2 = sum0_2 / (len(Xtrn_0_2) - 1)
print("Class 0 Column 1 mean: %f, variance %f " % (train0_col1_mean, var0_1))
print("Class 0 Column 2 mean: %f, variance %f " % (train0_col2_mean, var0_2))

# mean and variance of each x in Class 1
Xtrn_1_1 = []
Xtrn_1_2 = []
for train_items in data_train:
    if train_items[2] == 1:
        Xtrn_1_1.append(train_items[0])  # first column of training set
        Xtrn_1_2.append(train_items[1])  # second column of training set

print("\nNumber of Class 1 items in training set: %d " % len(Xtrn_1_1))
train1_col1_mean = np.mean(Xtrn_1_1)
train1_col2_mean = np.mean(Xtrn_1_2)

sum1_1 = 0
sum1_2 = 0
# add up (x - m)^2 for each column
for i in range(0, len(Xtrn_1_1)):
    sum1_1 += (Xtrn_1_1[i] - train1_col1_mean) * (Xtrn_1_1[i] - train1_col1_mean)  # col1
    sum1_2 += (Xtrn_1_2[i] - train1_col2_mean) * (Xtrn_1_2[i] - train1_col2_mean)  # col2

var1_1 = sum1_1 / (len(Xtrn_1_1) - 1)
var1_2 = sum1_2 / (len(Xtrn_1_2) - 1)
print("Class 1 Column 1 mean: %f, variance %f " % (train1_col1_mean, var1_1))
print("Class 1 Column 2 mean: %f, variance %f " % (train1_col2_mean, var1_2))

# FOR TRAINING DATA - CLASSIFY
# 3-column list the length of the training dataset: [p(class 0), p(class 1), predicted label]
Xprob = [[0 for x in range(3)] for y in range(len(Xtrn))]
count_correct = 0
for i in range(0, len(Xtrn)):
    # note - this loop assumes all of the lists are the same length! may need to check later

    # probability of x1 given class 1 (Gaussian pdf: 1/sqrt(2*pi*var) * exp(-(x-m)^2 / (2*var)))
    Px1_1 = (1 / np.sqrt(2 * np.pi * var1_1)) * np.exp(-np.square(Xtrn[i, 0] - train1_col1_mean) / (2 * var1_1))
    # probability of x2 given class 1
    Px1_2 = (1 / np.sqrt(2 * np.pi * var1_2)) * np.exp(-np.square(Xtrn[i, 1] - train1_col2_mean) / (2 * var1_2))
    # p(c1) * p(x1|c1) * p(x2|c1)
    Px1 = (Px1_1 * Px1_2) * (len(Xtrn_1_1) / len(Xtrn))
    Xprob[i][1] = Px1
    #print("Probability of X%d being in Class 1 = %f " % (i, Px1))

    # probability of x1 given class 0
    Px0_1 = (1 / np.sqrt(2 * np.pi * var0_1)) * np.exp(-np.square(Xtrn[i, 0] - train0_col1_mean) / (2 * var0_1))
    # probability of x2 given class 0
    Px0_2 = (1 / np.sqrt(2 * np.pi * var0_2)) * np.exp(-np.square(Xtrn[i, 1] - train0_col2_mean) / (2 * var0_2))
    # p(c0) * p(x1|c0) * p(x2|c0)
    Px0 = (Px0_1 * Px0_2) * (len(Xtrn_0_1) / len(Xtrn))
    Xprob[i][0] = Px0
    #print("Probability of X%d being in Class 0 = %f " % (i, Px0))

    # assign whichever class has the greater probability
    if Xprob[i][0] > Xprob[i][1]:
        Xprob[i][2] = 0
    elif Xprob[i][0] < Xprob[i][1]:
        Xprob[i][2] = 1
    else:
        Xprob[i][2] = random.randint(0, 1)  # if both probabilities are equal, randomly assign
    #print("X%d is predicted to be in Class %d and is actually in Class %d \n" % (i, Xprob[i][2], Ytrn[i]))
    if Xprob[i][2] == Ytrn[i]:
        count_correct += 1

print("\nTraining Set: %d correct classifications, %d incorrect." % (count_correct, len(Xtrn) - count_correct))

# FOR TEST DATA - same as above, but scoring Xtst against the training-set statistics
Xprob_tst = [[0 for x in range(3)] for y in range(len(Xtst))]
count_correct = 0
for i in range(0, len(Xtst)):
    # probability of x1 given class 1
    Px1_1 = (1 / np.sqrt(2 * np.pi * var1_1)) * np.exp(-np.square(Xtst[i, 0] - train1_col1_mean) / (2 * var1_1))
    # probability of x2 given class 1
    Px1_2 = (1 / np.sqrt(2 * np.pi * var1_2)) * np.exp(-np.square(Xtst[i, 1] - train1_col2_mean) / (2 * var1_2))
    # p(c1) * p(x1|c1) * p(x2|c1)
    Px1 = (Px1_1 * Px1_2) * (len(Xtrn_1_1) / len(Xtrn))
    Xprob_tst[i][1] = Px1
    #print("Probability of X%d being in Class 1 = %f " % (i, Px1))

    # probability of x1 given class 0
    Px0_1 = (1 / np.sqrt(2 * np.pi * var0_1)) * np.exp(-np.square(Xtst[i, 0] - train0_col1_mean) / (2 * var0_1))
    # probability of x2 given class 0
    Px0_2 = (1 / np.sqrt(2 * np.pi * var0_2)) * np.exp(-np.square(Xtst[i, 1] - train0_col2_mean) / (2 * var0_2))
    # p(c0) * p(x1|c0) * p(x2|c0)
    Px0 = (Px0_1 * Px0_2) * (len(Xtrn_0_1) / len(Xtrn))
    Xprob_tst[i][0] = Px0
    #print("Probability of X%d being in Class 0 = %f " % (i, Px0))

    if Xprob_tst[i][0] > Xprob_tst[i][1]:
        Xprob_tst[i][2] = 0
    elif Xprob_tst[i][0] < Xprob_tst[i][1]:
        Xprob_tst[i][2] = 1
    else:
        Xprob_tst[i][2] = random.randint(0, 1)  # if both probabilities are equal, randomly assign
    #print("X%d is predicted to be in Class %d and is actually in Class %d \n" % (i, Xprob_tst[i][2], Ytst[i]))
    if Xprob_tst[i][2] == Ytst[i]:
        count_correct += 1

print("Test Set: %d correct classifications, %d incorrect." % (count_correct, len(Xtst) - count_correct))
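As an aside (not part of the assignment code): the per-class means, variances, and priors can also be computed without the explicit loops. A minimal NumPy sketch, assuming the same train.txt layout, with ddof=1 to match the (n - 1) sample-variance denominator used above:

import numpy as np

data_train = np.loadtxt("train.txt")
X, y = data_train[:, 0:2], data_train[:, 2]

for c in (0, 1):
    Xc = X[y == c]                 # boolean mask picks out the rows of class c
    mu = Xc.mean(axis=0)           # per-column means
    var = Xc.var(axis=0, ddof=1)   # per-column sample variances, (n - 1) denominator
    prior = len(Xc) / len(X)       # p(c) estimated as the class frequency
    print("Class %d: means %s, variances %s, prior %.2f" % (c, mu, var, prior))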
Output:
Length of training set: 400
Length of test set: 400

Number of Class 0 items in training set: 200
Class 0 Column 1 mean: 0.120442, variance 0.902375
Class 0 Column 2 mean: 0.136034, variance 8.359462

Number of Class 1 items in training set: 200
Class 1 Column 1 mean: 2.043920, variance 0.905506
Class 1 Column 2 mean: 1.998865, variance 8.778899

Training Set: 342 correct classifications, 58 incorrect.
Test Set: 346 correct classifications, 54 incorrect.
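For a sanity check against a library, scikit-learn's GaussianNB fits the same model. A sketch, assuming scikit-learn is installed (it uses a biased variance estimate internally, so its counts may differ from the ones above by a point or so):

import numpy as np
from sklearn.naive_bayes import GaussianNB

data_train = np.loadtxt("train.txt")
data_test = np.loadtxt("test.txt")
Xtrn, Ytrn = data_train[:, 0:2], data_train[:, 2]
Xtst, Ytst = data_test[:, 0:2], data_test[:, 2]

clf = GaussianNB().fit(Xtrn, Ytrn)   # learns per-class means, variances, and priors
print("Train accuracy: %.3f" % clf.score(Xtrn, Ytrn))
print("Test accuracy:  %.3f" % clf.score(Xtst, Ytst))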