Skip to content

Spam Detection Jupyter Notebook

import pandas as pd


# Load the SMS Spam Collection. The file is tab-separated and ships
# without a header row, so column names are supplied explicitly.
sms_spam = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=['Label', 'SMS'],
)

print(sms_spam.shape)

sms_spam.head()

(5572, 2)

                                                                                                                                                                   
LabelSMS
0hamGo until jurong point, crazy.. Available only ...
1hamOk lar... Joking wif u oni...
2spamFree entry in 2 a wkly comp to win FA Cup fina...
3hamU dun say so early hor... U c already then say...
4hamNah I don't think he goes to usf, he lives aro...
# Class balance: fraction of ham vs spam over the whole dataset.
sms_spam['Label'].value_counts(normalize=True)

ham     0.865937

spam    0.134063

Name: Label, dtype: float64

# Shuffle the whole dataset; the fixed seed keeps the split reproducible.
data_randomized = sms_spam.sample(frac=1, random_state=1)

# Position of the 80/20 boundary.
training_test_index = round(len(data_randomized) * 0.8)

# First 80% of rows -> training set, remaining 20% -> test set,
# both re-indexed from zero.
training_set = data_randomized.iloc[:training_test_index].reset_index(drop=True)
test_set = data_randomized.iloc[training_test_index:].reset_index(drop=True)

print(training_set.shape)
print(test_set.shape)

(4458, 2)

(1114, 2)

# Verify the training split kept roughly the same spam/ham proportions.
training_set['Label'].value_counts(normalize=True)

ham     0.86541

spam    0.13459

Name: Label, dtype: float64

# Verify the test split kept roughly the same spam/ham proportions.
test_set['Label'].value_counts(normalize=True)

ham     0.868043

spam    0.131957

Name: Label, dtype: float64

# Before cleaning: show raw messages (punctuation and mixed case intact).
training_set.head(3)
                                                                                                               
LabelSMS
0hamYep, by the pretty sculpture
1hamYes, princess. Are you going to make me moan?
2hamWelp apparently he retired
# After cleaning
# Strip punctuation and normalize case so tokenization treats
# "Winner!" and "winner" as the same word.
# NOTE: regex=True is required — since pandas 1.4 (default changed in
# 2.0), Series.str.replace treats the pattern as a literal string by
# default, so without it '\W' would never match and punctuation would
# survive. The pattern is also a raw string: '\W' in a plain literal is
# an invalid escape (SyntaxWarning on Python >= 3.12).
training_set['SMS'] = training_set['SMS'].str.replace(
    r'\W', ' ', regex=True)  # Removes punctuation
training_set['SMS'] = training_set['SMS'].str.lower()

training_set.head(3)
                                                                                                               
LabelSMS
0hamyep  by the pretty sculpture
1hamyes  princess  are you going to make me moan
2hamwelp apparently he retired
# Tokenize: turn each cleaned message string into a list of words.
training_set['SMS'] = training_set['SMS'].str.split()



# Vocabulary: every distinct word appearing anywhere in the training
# messages (a set comprehension de-duplicates in one pass).
vocabulary = list({word for message in training_set['SMS'] for word in message})
len(vocabulary)

7783

# Toy example of the target representation: one column per word,
# one row per message, cell = how often the word occurs in the message.
word_counts_per_sms = {
    'secret': [2, 1, 1],
    'prize': [2, 0, 1],
    'claim': [1, 0, 1],
    'now': [1, 0, 1],
    'coming': [0, 1, 0],
    'to': [0, 1, 0],
    'my': [0, 1, 0],
    'party': [0, 1, 0],
    'winner': [0, 0, 1],
}

word_counts = pd.DataFrame(word_counts_per_sms)

word_counts.head()
                                                                                                                                                                                                                                                                                       
secretprizeclaimnowcomingtomypartywinner
0221100000
1100011110
2111100001
# Real count matrix: one zero-filled column per vocabulary word,
# one row per training message.
n_messages = len(training_set['SMS'])
word_counts_per_sms = {word: [0] * n_messages for word in vocabulary}

# Tally each token into its (word, message) cell.
for row, message in enumerate(training_set['SMS']):
    for token in message:
        word_counts_per_sms[token][row] += 1

word_counts = pd.DataFrame(word_counts_per_sms)

word_counts.head()
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
indyarocksportegsurlytrainedvodaorganisevuleg06...impostergee4882reducesexualnangeconductsnoworriesloanstaxesbook
00000000000...0000000000
10000000000...0000000000
20000000000...0000000000
30000000000...0000000000
40000000000...0000000000

5 rows × 7783 columns

# Join the Label/SMS columns with the per-word count columns
# (row-aligned because both were re-indexed from zero).
training_set_clean = pd.concat([training_set, word_counts], axis=1)

training_set_clean.head()
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
LabelSMSindyarocksportegsurlytrainedvodaorganisevu...impostergee4882reducesexualnangeconductsnoworriesloanstaxesbook
0ham[yep, by, the, pretty, sculpture]00000000...0000000000
1ham[yes, princess, are, you, going, to, make, me,...00000000...0000000000
2ham[welp, apparently, he, retired]00000000...0000000000
3ham[havent]00000000...0000000000
4ham[i, forgot, 2, ask, ü, all, smth, there, s, a,...00000000...0000000000

5 rows × 7785 columns

# Split the cleaned training data by class.
spam_messages = training_set_clean[training_set_clean['Label'] == 'spam']
ham_messages = training_set_clean[training_set_clean['Label'] == 'ham']

# Prior probabilities P(Spam) and P(Ham): class frequency in training.
n_training = len(training_set_clean)
p_spam = len(spam_messages) / n_training
p_ham = len(ham_messages) / n_training

# N_Spam / N_Ham: total number of word tokens in each class
# (each SMS is already a token list, so len() counts its words).
n_words_per_spam_message = spam_messages['SMS'].apply(len)
n_spam = n_words_per_spam_message.sum()

n_words_per_ham_message = ham_messages['SMS'].apply(len)
n_ham = n_words_per_ham_message.sum()

# N_Vocabulary: number of distinct words seen in training.
n_vocabulary = len(vocabulary)

# Laplace (add-one) smoothing constant so unseen counts never zero out
# the product of probabilities.
alpha = 1
# Per-word likelihoods P(word|Spam) and P(word|Ham) with Laplace
# smoothing. The denominators are the same for every word, so compute
# them once outside the loop.
spam_denominator = n_spam + alpha * n_vocabulary
ham_denominator = n_ham + alpha * n_vocabulary

parameters_spam = {}
parameters_ham = {}

for word in vocabulary:
    # Column sum = total occurrences of `word` in that class's messages.
    parameters_spam[word] = (spam_messages[word].sum() + alpha) / spam_denominator
    parameters_ham[word] = (ham_messages[word].sum() + alpha) / ham_denominator
import re


def classify(message):
   '''Print the Naive Bayes posteriors for `message` and the chosen label.

   message: a string

   Uses the module-level priors (p_spam, p_ham) and per-word
   likelihoods (parameters_spam, parameters_ham) fitted above.
   Words not seen in training are simply skipped.
   '''

   # Same cleaning as the training data: drop punctuation, lowercase,
   # split into tokens. Raw string r'\W' — a plain '\W' is an invalid
   # escape sequence (SyntaxWarning on Python >= 3.12).
   message = re.sub(r'\W', ' ', message)
   message = message.lower().split()

   # Start from the priors and multiply in P(word|class) per token.
   p_spam_given_message = p_spam
   p_ham_given_message = p_ham

   for word in message:
      if word in parameters_spam:
         p_spam_given_message *= parameters_spam[word]

      if word in parameters_ham:
         p_ham_given_message *= parameters_ham[word]

   print('P(Spam|message):', p_spam_given_message)
   print('P(Ham|message):', p_ham_given_message)

   if p_ham_given_message > p_spam_given_message:
      print('Label: Ham')
   elif p_ham_given_message < p_spam_given_message:
      print('Label: Spam')
   else:
      # Fixed typo in the original message ("proabilities").
      print('Equal probabilities, have a human classify this!')
# Sanity check on an obviously spammy message.
classify('WINNER!! This is the secret code to unlock the money: C3421.')

P(Spam|message): 1.3481290211300841e-25

P(Ham|message): 1.9368049028589875e-27

Label: Spam

# Sanity check on an obviously legitimate message.
classify("Sounds good, Tom, then see u there")

P(Spam|message): 2.4372375665888117e-25

P(Ham|message): 3.687530435009238e-21

Label: Ham

def classify_test_set(message):
   '''Return the Naive Bayes label for `message`.

   message: a string

   Returns 'ham', 'spam', or 'needs human classification' (exact tie).
   Same algorithm as classify() but returns the label instead of
   printing, so it can be used with Series.apply.
   '''

   # Same cleaning as the training data. Raw string r'\W' — a plain
   # '\W' is an invalid escape (SyntaxWarning on Python >= 3.12).
   message = re.sub(r'\W', ' ', message)
   message = message.lower().split()

   # Start from the priors and multiply in P(word|class) per token;
   # words unseen in training are skipped.
   p_spam_given_message = p_spam
   p_ham_given_message = p_ham

   for word in message:
      if word in parameters_spam:
         p_spam_given_message *= parameters_spam[word]

      if word in parameters_ham:
         p_ham_given_message *= parameters_ham[word]

   if p_ham_given_message > p_spam_given_message:
      return 'ham'

   elif p_spam_given_message > p_ham_given_message:
      return 'spam'

   else:
      return 'needs human classification'
# Predict a label for every message in the held-out test set.
test_set['predicted'] = test_set['SMS'].apply(classify_test_set)

test_set.head()
                                                                                                                                                                                                       
LabelSMSpredicted
0hamLater i guess. I needa do mcat study too.ham
1hamBut i haf enuff space got like 4 mb...ham
2spamHad your mobile 10 mths? Update to latest Oran...spam
3hamAll sounds good. Fingers . Makes it difficult ...ham
4hamAll done, all handed in. Don't know if mega sh...ham
# Accuracy: fraction of test messages whose prediction matches the
# true label. A vectorized element-wise comparison replaces the
# original iterrows() loop — same counts, no per-row Python overhead
# (iterrows also boxes each row into a Series, which is both slow and
# a well-known pandas anti-pattern for simple aggregation).
total = test_set.shape[0]
correct = int((test_set['Label'] == test_set['predicted']).sum())

print('Correct:', correct)
print('Incorrect:', total - correct)
print('Accuracy:', correct/total)

Correct: 1100

Incorrect: 14

Accuracy: 0.9874326750448833



tags: !NLPIndex