ML Project 1 (Post 6)

I will post again on this project later to summarize everything that I learned, and hopefully clean up the code a bit now that I’m not under a time constraint to just get enough done to turn in! Also, now that I’ve submitted my work, any advice on the approach is welcome!

The 4 classification algorithms I wrote were:

  • Naive Bayes (see the sketch right after this list)
  • Bayes
  • Gaussian Kernel Density Estimator
  • K-Nearest Neighbor
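
The code for the first two isn't in this post, so as a quick illustration, here's a rough sketch of the Gaussian Naive Bayes idea (just a sketch, not necessarily the code I submitted): fit a per-feature Gaussian for each class, then pick the class with the highest log-likelihood plus log-prior. It assumes the same Xtrn/Ytrn/Xtst arrays as the scripts below; naive_bayes_fit and naive_bayes_predict are just names I picked for the sketch.

import numpy as np

def naive_bayes_fit(X, y):
    #per-class feature means, variances, and priors
    stats = {}
    for c in np.unique(y):
        Xc = X[y == c]
        stats[c] = (Xc.mean(axis=0), Xc.var(axis=0), len(Xc) / len(X))
    return stats

def naive_bayes_predict(Xq, stats):
    preds = []
    for xq in Xq:
        best, best_score = None, -np.inf
        for c, (mu, var, prior) in stats.items():
            #naive assumption: features are independent given the class,
            #so log p(x|c) is a sum of 1-D Gaussian log-densities
            score = -0.5 * np.sum(np.log(2 * np.pi * var) + (xq - mu) ** 2 / var) + np.log(prior)
            if score > best_score:
                best, best_score = c, score
        preds.append(best)
    return np.array(preds)

#usage: preds = naive_bayes_predict(Xtst, naive_bayes_fit(Xtrn, Ytrn))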

I’m really proud of myself for understanding these well enough, and ramping up on Python quickly enough, to finish in the week or so of late nights I had to work on it. The code for the final two classifiers is below. It’s clear I still have a lot of learning to do, but I finished enough to get results to compare and turn in, so I’m happy with how far I got. Cleanup and efficiency improvements can come later!

import numpy as np

#bring in data from files
data_train = np.loadtxt("train.txt")
data_test = np.loadtxt("test.txt")
Xtrn = data_train[:, 0:2]  # first 2 columns of training set
Ytrn = data_train[:, 2]  # last column, 1/0 labels
Xtst = data_test[:, 0:2]  # first 2 columns of test set
Ytst = data_test[:, 2]  # last column, 1/0 labels
print("Length of training set: %d " % len(Xtrn))
print("Length of test set: %d " % len(Xtst))

#items in class 0
Xtrn_0 = [[0 for x in range(2)] for y in range(int(len(Xtrn)/2))] #note, currently hardcoded that each class is 1/2 of the total
#loop through training data and find the items labeled 0
n = 0
for train_items in data_train:
    if train_items[2] == 0:
        #setting up new more general way so can do covariance matrix
        Xtrn_0[n][0] = train_items[0]
        Xtrn_0[n][1] = train_items[1]
        n=n+1
#items in class 1
Xtrn_1 = [[0 for x in range(2)] for y in range(int(len(Xtrn)/2))] 
#loop through training data and find the items labeled 1
n = 0
for train_items in data_train:
    if train_items[2] == 1:
        #setting up new more general way so can do covariance matrix
        Xtrn_1[n][0] = train_items[0]
        Xtrn_1[n][1] = train_items[1]
        n=n+1


#set h for the GKDE "manual" estimation
h = 1.6
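#the loops below compute, for each test point x and each class c:
#  p(x|c) = (1/n_c) * sum_i (1/(2*pi*h^2)) * exp(-||x - x_i||^2 / (2*h^2))
#i.e. a 2-D Gaussian kernel density estimate with bandwidth h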

prob0 = []
for x in range(len(Xtst)):
    sum0 = 0.0
    for r in range(len(Xtrn_0)):
        #2-D Gaussian kernel: (2*pi*h^2)^(-1) * exp(-||x - xi||^2 / (2*h^2))
        sum0 += (1 / (2 * np.pi * h * h)) * np.exp(-(np.linalg.norm(Xtst[x] - Xtrn_0[r]) ** 2) / (2 * h * h))
    prob0.append(sum0/len(Xtrn_0))


prob1 = []
for x in range(len(Xtst)):
    sum1 = 0.0
    for r in range(len(Xtrn_1)):
        #2-D Gaussian kernel: (2*pi*h^2)^(-1) * exp(-||x - xi||^2 / (2*h^2))
        sum1 += (1 / (2 * np.pi * h * h)) * np.exp(-(np.linalg.norm(Xtst[x] - Xtrn_1[r]) ** 2) / (2 * h * h))
    prob1.append(sum1/len(Xtrn_1))


Xprob_GKDE = []
for i in range(len(Xtst)):
   #assign class
   if prob0[i] > prob1[i]:
       Xprob_GKDE.append(0)
   else:
       Xprob_GKDE.append(1)  #NOTE: ties (equal densities) fall through to here, so equal = class 1

correct_class_new = 0
for i in range(len(Xtst)):
   if Xprob_GKDE[i] == Ytst[i]:
       correct_class_new += 1

print("Correct GKDE: %d" % correct_class_new)
print("Incorrect GKDE: %d" % (len(Xtst) - correct_class_new))

And here is the K-Nearest Neighbor classifier:

import numpy as np
import random

#bring in data from files
data_train = np.loadtxt("train.txt")
data_test = np.loadtxt("test.txt")
Xtrn = data_train[:, 0:2]  # first 2 columns of training set
Ytrn = data_train[:, 2]  # last column, 1/0 labels
Xtst = data_test[:, 0:2]  # first 2 columns of test set
Ytst = data_test[:, 2]  # last column, 1/0 labels
print("Length of training set: %d " % len(Xtrn))
print("Length of test set: %d " % len(Xtst))


#count items in each class
class0_count = 0
class1_count = 0
for train_items in data_train:
    if train_items[2] == 0:
        class0_count +=1
    elif train_items[2] == 1:
        class1_count +=1

print("Training points in Class 0: %d" % class0_count)
print("Training points in Class 1: %d" % class1_count)

#probability of each class
pc0 = class0_count / len(Xtrn)
pc1 = class1_count / len(Xtrn)

#for each point in the test set, loop through each point in the training set, and find the k nearest
k = 11
print("%d Nearest Neighbors" %k)
xClass = []
for i in range(len(Xtst)):
    Distances = []
    KClass0 = 0
    KClass1 = 0
    pXc0 = 0
    pXc1 = 0
    pX = 0
    for j in range(len(Xtrn)):
        #get the distance of each item in the training set from this item in the test set
        Distances.append((j,np.linalg.norm(Xtst[i] - Xtrn[j]))) #index to store for when set is sorted, distance between vectors 

    #print(i)
    #sort those distances
    Distances.sort(key = lambda tup: tup[1])
    #print(Distances)
    #Kth training item distance from this test item
    KVol = np.pi * np.square(Distances[k-1][1])  #area of the 2-D disk out to the kth neighbor
    #print(KVol)
    for m in range(k):
        #get the class of each of the k nearest neighbors by index of mth sorted item
        if Ytrn[Distances[m][0]] == 0:
            KClass0 +=1
        if Ytrn[Distances[m][0]] == 1:
            KClass1 +=1
    pXc0 = KClass0 / (class0_count * KVol)
    pXc1 = KClass1 / (class1_count * KVol)
    pX = k / (len(Xtrn)*KVol)
    pc0X = (pXc0 * pc0) / pX
    pc1X = (pXc1 * pc1) / pX
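    #everything cancels here (KVol, the class counts, len(Xtrn)), so these
    #posteriors reduce to KClass0/k and KClass1/k, i.e. a majority vote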
    #print("Probability Class 0 for Point %d: %f" %(i,pc0X))
    #print("Probability Class 1 for Point %d: %f" %(i,pc1X))
    if pc0X > pc1X:
        xClass.append(0) #probability of class 0 is higher, assign to class 0
    elif pc0X < pc1X:
        xClass.append(1)
    else:
        #if both probabilities are equal, assign randomly
        xClass.append(random.randint(0, 1))
        
    #print("Class: %d" % xClass[i])

correct_class = 0
for i in range(len(Xtst)):
   if xClass[i] == Ytst[i]:
       correct_class += 1

print("Correct Class: %d" % correct_class)
print("Incorrect Class: %d" % (len(Xtst) - correct_class))