ML Project 1 (Post 3)

I present to you… my first classifier! Naive Bayes! It appears to work! haha :)

I know it’s a mess, but I’ve barely used Python before and I’m new to Machine Learning, so I’m learning as I go. This is for part #2 of my project.
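In case it helps anyone following along, here’s the rule the code implements (as I understand it). Gaussian Naive Bayes assumes the two features are independent given the class, fits a normal distribution to each feature per class, and assigns a point to whichever class gets the bigger score:

score(c) = p(c) · p(x1|c) · p(x2|c),   where   p(x|c) = (1/√(2πσ²)) · exp(−(x−μ)²/(2σ²))

Here μ and σ² are the mean and variance of that feature over the training rows labeled c, and p(c) is just the fraction of training rows in class c.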

import numpy as np
import random  # used to break ties when both class scores are equal

#bring in data from files (np.loadtxt accepts a filename directly)
data_train = np.loadtxt("train.txt")
data_test = np.loadtxt("test.txt")
#note: a plain numpy array can't mix dtypes, so the 0/1 labels load as floats along with everything else; that's fine for the slicing and comparisons below
Xtrn = data_train[:, 0:2]  # first 2 columns of training set
Ytrn = data_train[:, 2]  # last column, 1/0 labels
Xtst = data_test[:, 0:2]  # first 2 columns of test set
Ytst = data_test[:, 2]  # last column, 1/0 labels
print("Length of training set: %d " % len(Xtrn))
print("Length of test set: %d " % len(Xtst))

#mean and variance of each feature in Class 0
#initialize
Xtrn_0_1 = []
Xtrn_0_2 = []
#loop through training data and find the items labeled 0
for train_items in data_train:
    if train_items[2] == 0:
        Xtrn_0_1.append(train_items[0])  # first column of training set
        Xtrn_0_2.append(train_items[1])  # second column of training set
print("\nNumber of Class 0 items in training set: %d " % len(Xtrn_0_1))
train0_col1_mean = np.mean(Xtrn_0_1)
train0_col2_mean = np.mean(Xtrn_0_2)
#train0_cov = np.cov(Xtrn_0_1,Xtrn_0_2)[0][1]  # covariance isn't needed for Naive Bayes
#calculating the variances by hand, without numpy
sum0_1 = 0
sum0_2 = 0
#add up (x-m)^2
for i in range(len(Xtrn_0_1)):
    #col1 variance
    sum0_1 += (Xtrn_0_1[i] - train0_col1_mean) ** 2
    #col2 variance
    sum0_2 += (Xtrn_0_2[i] - train0_col2_mean) ** 2
#(no covariance needed: Naive Bayes treats the features as independent)
#variance of each column
var0_1 = sum0_1/(len(Xtrn_0_1)-1)
var0_2 = sum0_2/(len(Xtrn_0_2)-1)
print("Class 0 Column 1 mean: %f, variance %f " % (train0_col1_mean, var0_1))
print("Class 0 Column 2 mean: %f, variance %f " % (train0_col2_mean, var0_2))

#mean and variance of each feature in Class 1
Xtrn_1_1 = []
Xtrn_1_2 = []
for train_items in data_train:
    if train_items[2] == 1:
        Xtrn_1_1.append(train_items[0])  # first column of training set
        Xtrn_1_2.append(train_items[1])  # second column of training set
print("\nNumber of Class 1 items in training set: %d " % len(Xtrn_1_1))
train1_col1_mean = np.mean(Xtrn_1_1)
train1_col2_mean = np.mean(Xtrn_1_2)
sum1_1 = 0
sum1_2 = 0
#add up (x-m)^2
for i in range(len(Xtrn_1_1)):
    #col1 variance
    sum1_1 += (Xtrn_1_1[i] - train1_col1_mean) ** 2
    #col2 variance
    sum1_2 += (Xtrn_1_2[i] - train1_col2_mean) ** 2
var1_1 = sum1_1/(len(Xtrn_1_1)-1)
var1_2 = sum1_2/(len(Xtrn_1_2)-1)
print("Class 1 Column 1 mean: %f, variance %f " % (train1_col1_mean, var1_1))
print("Class 1 Column 2 mean: %f, variance %f " % (train1_col2_mean, var1_2))
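
#(sanity check) my hand-rolled sums should match numpy's built-in estimator;
#np.var with ddof=1 uses the same (n-1) denominator as the code above
assert np.isclose(var0_1, np.var(Xtrn_0_1, ddof=1))
assert np.isclose(var0_2, np.var(Xtrn_0_2, ddof=1))
assert np.isclose(var1_1, np.var(Xtrn_1_1, ddof=1))
assert np.isclose(var1_2, np.var(Xtrn_1_2, ddof=1))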

#FOR TRAINING DATA - CLASSIFY
#3-column list the length of the training dataset
Xprob = [[0 for x in range(3)] for y in range(len(Xtrn))]
count_correct = 0
for i in range(0, len(Xtrn)):
    #note - this loop assumes all of the lists are the same length! - may need to check later
    #probability of x1 given class 1 (Gaussian density)
    Px1_1 = (1 / np.sqrt(2*np.pi*var1_1)) * np.exp( -np.square(Xtrn[i,0]-train1_col1_mean) / (2*var1_1) )
    #probability of x2 given class 1
    Px1_2 = (1 / np.sqrt(2*np.pi*var1_2)) * np.exp( -np.square(Xtrn[i,1]-train1_col2_mean) / (2*var1_2) )
    #p(c1)*p(x1|c1)*p(x2|c1)
    Px1 = (Px1_1 * Px1_2)*(len(Xtrn_1_1)/len(Xtrn))
    Xprob[i][1] = Px1
    #print("Probability of X%d being in Class 1 = %f " % (i,Px1))
    #probability of x1 given class 0
    Px0_1 = (1 / np.sqrt(2*np.pi*var0_1)) * np.exp( -np.square(Xtrn[i,0]-train0_col1_mean) / (2*var0_1) )
    #probability of x2 given class 0
    Px0_2 = (1 / np.sqrt(2*np.pi*var0_2)) * np.exp( -np.square(Xtrn[i,1]-train0_col2_mean) / (2*var0_2) )
    #p(c0)*p(x1|c0)*p(x2|c0)
    Px0 = (Px0_1 * Px0_2)*(len(Xtrn_0_1)/len(Xtrn))
    Xprob[i][0] = Px0
    #print("Probability of X%d being in Class 0 = %f " % (i,Px0))
    #if the probability of being in class 0 is greater than the probability of being in class 1, assign class 0 (etc.)
    if Xprob[i][0] > Xprob[i][1]:
        Xprob[i][2] = 0
    elif Xprob[i][0] < Xprob[i][1]:
        Xprob[i][2] = 1 
    else:
        Xprob[i][2] = random.randint(0, 1) #if both probabilities are equal, randomly assign
    #print("X%d is predicted to be in Class %d and is actually in Class %d \n" % (i,Xprob[i][2],Ytrn[i]))
    if Xprob[i][2] == Ytrn[i]:
        count_correct += 1

print("\nTraining Set: %d correct classifications, %d incorrect." % (count_correct, len(Xtrn)-count_correct))
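
#(note to self) with just two features the raw products are fine, but with many
#features products of small densities can underflow to 0.0; the standard fix is
#to compare log-scores instead, e.g. np.log(prior) + np.log(Px0_1) + np.log(Px0_2)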

#FOR TEST DATA
Xprob_tst = [[0 for x in range(3)] for y in range(len(Xtst))]
count_correct = 0
for i in range(0, len(Xtst)):
    #probability of x1 given class 1 (Gaussian density)
    Px1_1 = (1 / np.sqrt(2*np.pi*var1_1)) * np.exp( -np.square(Xtst[i,0]-train1_col1_mean) / (2*var1_1) )
    #probability of x2 given class 1
    Px1_2 = (1 / np.sqrt(2*np.pi*var1_2)) * np.exp( -np.square(Xtst[i,1]-train1_col2_mean) / (2*var1_2) )
    #p(c1)*p(x1|c1)*p(x2|c1)
    Px1 = (Px1_1 * Px1_2)*(len(Xtrn_1_1)/len(Xtrn))
    Xprob_tst[i][1] = Px1
    #print("Probability of X%d being in Class 1 = %f " % (i,Px1))
    #probability of x1 given class 0
    Px0_1 = (1 / np.sqrt(2*np.pi*var0_1)) * np.exp( -np.square(Xtst[i,0]-train0_col1_mean) / (2*var0_1) )
    #probability of x2 given class 0
    Px0_2 = (1 / np.sqrt(2*np.pi*var0_2)) * np.exp( -np.square(Xtst[i,1]-train0_col2_mean) / (2*var0_2) )
    #p(c0)*p(x1|c0)*p(x2|c0)
    Px0 = (Px0_1 * Px0_2)*(len(Xtrn_0_1)/len(Xtrn))
    Xprob_tst[i][0] = Px0
    #print("Probability of X%d being in Class 0 = %f " % (i,Px0))
    if Xprob_tst[i][0] > Xprob_tst[i][1]:
        Xprob_tst[i][2] = 0
    elif Xprob_tst[i][0] < Xprob_tst[i][1]:
        Xprob_tst[i][2] = 1 
    else:
        Xprob_tst[i][2] = random.randint(0, 1) #if both probabilities are equal, randomly assign
    #print("X%d is predicted to be in Class %d and is actually in Class %d \n" % (i,Xprob_tst[i][2],Ytst[i]))
    if Xprob_tst[i][2] == Ytst[i]:
        count_correct += 1

print("Test Set: %d correct classifications, %d incorrect." % (count_correct, len(Xtst)-count_correct))

Output:

Length of training set: 400
Length of test set: 400

Number of Class 0 items in training set: 200
Class 0 Column 1 mean: 0.120442, variance 0.902375
Class 0 Column 2 mean: 0.136034, variance 8.359462

Number of Class 1 items in training set: 200
Class 1 Column 1 mean: 2.043920, variance 0.905506
Class 1 Column 2 mean: 1.998865, variance 8.778899

Training Set: 342 correct classifications, 58 incorrect.
Test Set: 346 correct classifications, 54 incorrect.
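
As a cross-check (not part of the assignment), scikit-learn ships this same model as GaussianNB, assuming you have sklearn installed; it should land on essentially the same accuracy, give or take small differences such as how the variance is estimated:

import numpy as np
from sklearn.naive_bayes import GaussianNB

data_train = np.loadtxt("train.txt")
data_test = np.loadtxt("test.txt")

clf = GaussianNB()
clf.fit(data_train[:, 0:2], data_train[:, 2])
print("Train accuracy: %f" % clf.score(data_train[:, 0:2], data_train[:, 2]))
print("Test accuracy: %f" % clf.score(data_test[:, 0:2], data_test[:, 2]))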