I'll post again on this project later to summarize everything I learned, and hopefully clean up the code a bit now that I'm not under a time constraint to get just enough done to turn in! Also, now that I've submitted my work, any advice on the approach is welcome!
The 4 classification algorithms I wrote were:
- Naive Bayes
- Bayes
- Gaussian Kernel Density Estimator
- K-Nearest Neighbor
I'm really proud of myself for understanding these well enough, and ramping up on Python quickly enough, to finish in the week or so of late nights I had to work on it. It's clear I still have a lot of learning to do, but I finished enough to get results to compare and turn in, so I'm happy with how far I got. Code cleanup and better efficiency can come later! The code for the final two classifiers is below.
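Since I'm not pasting the first two, here's the general shape of a full-covariance Gaussian Bayes classifier on this data layout. This is a minimal sketch, not the code I submitted; the Naive Bayes version is the same idea with the off-diagonal covariance terms zeroed out (treating the two features as independent).

```python
import numpy as np

# sketch only: full-covariance Gaussian Bayes on the same
# train.txt/test.txt layout used by the classifiers below
data_train = np.loadtxt(open("train.txt"))
data_test = np.loadtxt(open("test.txt"))
Xtrn, Ytrn = data_train[:, 0:2], data_train[:, 2]
Xtst, Ytst = data_test[:, 0:2], data_test[:, 2]

X0, X1 = Xtrn[Ytrn == 0], Xtrn[Ytrn == 1]  # split training points by class

# fit one Gaussian per class: sample mean and covariance
mu0, cov0 = X0.mean(axis=0), np.cov(X0, rowvar=False)
mu1, cov1 = X1.mean(axis=0), np.cov(X1, rowvar=False)
p0, p1 = len(X0) / len(Xtrn), len(X1) / len(Xtrn)  # class priors

def gauss_pdf(x, mu, cov):
    # 2D multivariate normal density
    d = x - mu
    return np.exp(-0.5 * d @ np.linalg.inv(cov) @ d) / (
        2 * np.pi * np.sqrt(np.linalg.det(cov)))

# assign each test point to the class with the larger posterior
correct = 0
for x, y in zip(Xtst, Ytst):
    pred = 0 if p0 * gauss_pdf(x, mu0, cov0) >= p1 * gauss_pdf(x, mu1, cov1) else 1
    correct += (pred == y)
print("Correct Bayes: %d" % correct)
print("Incorrect Bayes: %d" % (len(Xtst) - correct))
```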
First, the Gaussian Kernel Density Estimator:

```python
import numpy as np

# bring in data from files
data_train = np.loadtxt(open("train.txt"))
data_test = np.loadtxt(open("test.txt"))

Xtrn = data_train[:, 0:2]  # first 2 columns of training set
Ytrn = data_train[:, 2]    # last column, 1/0 labels
Xtst = data_test[:, 0:2]   # first 2 columns of test set
Ytst = data_test[:, 2]     # last column, 1/0 labels

print("Length of training set: %d" % len(Xtrn))
print("Length of test set: %d" % len(Xtst))

# items in class 0
# note: currently hardcoded that each class is 1/2 of the total
Xtrn_0 = [[0 for x in range(2)] for y in range(int(len(Xtrn) / 2))]

# loop through training data and find the items labeled 0
n = 0
for train_items in data_train:
    if train_items[2] == 0:
        # setting up new more general way so can do covariance matrix
        Xtrn_0[n][0] = train_items[0]
        Xtrn_0[n][1] = train_items[1]
        n = n + 1

# items in class 1
Xtrn_1 = [[0 for x in range(2)] for y in range(int(len(Xtrn) / 2))]

# loop through training data and find the items labeled 1
n = 0
for train_items in data_train:
    if train_items[2] == 1:
        Xtrn_1[n][0] = train_items[0]
        Xtrn_1[n][1] = train_items[1]
        n = n + 1

# set h (the bandwidth) for the GKDE "manual" estimation
h = 1.6

# class-conditional density estimate at each test point: the average of
# 2D Gaussian kernels centered on each training point of that class
prob0 = []
for x in range(len(Xtst)):
    sum0 = 0.0
    for r in range(len(Xtrn_0)):
        sum0 += (1 / (2 * np.pi * h * h)) * np.exp(
            -(np.linalg.norm(Xtst[x] - Xtrn_0[r]) ** 2) / (2 * h * h))
    prob0.append(sum0 / len(Xtrn_0))

prob1 = []
for x in range(len(Xtst)):
    sum1 = 0.0
    for r in range(len(Xtrn_1)):
        sum1 += (1 / (2 * np.pi * h * h)) * np.exp(
            -(np.linalg.norm(Xtst[x] - Xtrn_1[r]) ** 2) / (2 * h * h))
    prob1.append(sum1 / len(Xtrn_1))

# assign each test point to the class with the higher density estimate
Xprob_GKDE = []
for i in range(len(Xtst)):
    if prob0[i] > prob1[i]:
        Xprob_GKDE.append(0)
    else:
        # NOTE: ties fall through to class 1
        Xprob_GKDE.append(1)

correct_class_new = 0
for i in range(len(Xtst)):
    if Xprob_GKDE[i] == Ytst[i]:
        correct_class_new += 1

print("Correct GKDE: %d" % correct_class_new)
print("Incorrect GKDE: %d" % (len(Xtst) - correct_class_new))
```
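As a preview of the efficiency pass I mentioned, the double loop above can collapse into NumPy broadcasting. This is just a sketch that reuses the Xtrn, Ytrn, Xtst, and Ytst arrays loaded above; boolean indexing also drops the hardcoded assumption that each class is half the data.

```python
import numpy as np

def gkde(Xtst, Xc, h):
    # squared distance from every test point to every class-c training
    # point, shape (n_test, n_class_c)
    sq = np.sum((Xtst[:, None, :] - Xc[None, :, :]) ** 2, axis=2)
    # mean of 2D Gaussian kernels with bandwidth h
    return np.mean(np.exp(-sq / (2 * h * h)), axis=1) / (2 * np.pi * h * h)

h = 1.6
prob0 = gkde(Xtst, Xtrn[Ytrn == 0], h)
prob1 = gkde(Xtst, Xtrn[Ytrn == 1], h)
pred = (prob1 >= prob0).astype(int)  # ties still fall through to class 1
print("Correct GKDE: %d" % np.sum(pred == Ytst))
```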
And the K-Nearest Neighbor classifier:

```python
import numpy as np
import random

# bring in data from files
data_train = np.loadtxt(open("train.txt"))
data_test = np.loadtxt(open("test.txt"))

Xtrn = data_train[:, 0:2]  # first 2 columns of training set
Ytrn = data_train[:, 2]    # last column, 1/0 labels
Xtst = data_test[:, 0:2]   # first 2 columns of test set
Ytst = data_test[:, 2]     # last column, 1/0 labels

print("Length of training set: %d" % len(Xtrn))
print("Length of test set: %d" % len(Xtst))

# count items in each class
class0_count = 0
class1_count = 0
for train_items in data_train:
    if train_items[2] == 0:
        class0_count += 1
    elif train_items[2] == 1:
        class1_count += 1

print("Training points in Class 0: %d" % class0_count)
print("Training points in Class 1: %d" % class1_count)

# prior probability of each class
pc0 = class0_count / len(Xtrn)
pc1 = class1_count / len(Xtrn)

# for each point in the test set, loop through each point in the training
# set and find the k nearest
k = 11
print("%d Nearest Neighbors" % k)

xClass = []
for i in range(len(Xtst)):
    Distances = []
    KClass0 = 0
    KClass1 = 0

    for j in range(len(Xtrn)):
        # get the distance of each item in the training set from this item
        # in the test set; store the index too so the label survives the sort
        Distances.append((j, np.linalg.norm(Xtst[i] - Xtrn[j])))

    # sort those distances
    Distances.sort(key=lambda tup: tup[1])

    # neighborhood "volume" for 2D data: area of the circle whose radius
    # is the distance to the kth nearest training point
    KVol = np.pi * np.square(Distances[k - 1][1])

    # get the class of each of the k nearest neighbors by index of the
    # mth sorted item
    for m in range(k):
        if Ytrn[Distances[m][0]] == 0:
            KClass0 += 1
        if Ytrn[Distances[m][0]] == 1:
            KClass1 += 1

    # KNN density estimates, then posteriors via Bayes' rule
    pXc0 = KClass0 / (class0_count * KVol)
    pXc1 = KClass1 / (class1_count * KVol)
    pX = k / (len(Xtrn) * KVol)
    pc0X = (pXc0 * pc0) / pX
    pc1X = (pXc1 * pc1) / pX

    if pc0X > pc1X:
        xClass.append(0)  # probability of class 0 is higher, assign to class 0
    elif pc0X < pc1X:
        xClass.append(1)
    else:
        # if both probabilities are equal, assign randomly
        xClass.append(random.randint(0, 1))

correct_class = 0
for i in range(len(Xtst)):
    if xClass[i] == Ytst[i]:
        correct_class += 1

print("Correct Class: %d" % correct_class)
print("Incorrect Class: %d" % (len(Xtst) - correct_class))
```
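One thing I noticed writing this up: KVol and pX are the same for both classes, and the per-class counts in the density denominators cancel against the priors, so comparing pc0X and pc1X reduces to a plain majority vote over the k neighbors. Here's a quick sanity-check sketch reusing the arrays loaded above (with an odd k there are no ties, so the random tie-break never fires):

```python
import numpy as np

k = 11
# distance from every test point to every training point
dists = np.linalg.norm(Xtst[:, None, :] - Xtrn[None, :, :], axis=2)
nearest = np.argsort(dists, axis=1)[:, :k]  # indices of the k nearest
votes1 = Ytrn[nearest].sum(axis=1)          # labels are 0/1, so the sum counts class-1 votes
pred = (votes1 > k / 2).astype(int)         # majority vote
print("Correct Class: %d" % np.sum(pred == Ytst))
```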