Spam Detection Jupyter Notebook
import csv
import math

import pandas as pd
# Load the tab-separated SMS Spam Collection. The file has no header row, so
# the two columns are named explicitly.
# quoting=csv.QUOTE_NONE makes the parser treat double quotes as ordinary
# characters: SMS text is not CSV-quoted, and with the default quote handling
# an unmatched '"' inside a message can swallow the following line(s) into the
# same field.
# NOTE(review): expect the row count to change slightly once this is applied;
# confirm it against the dataset's published message count.
sms_spam = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=['Label', 'SMS'],
    quoting=csv.QUOTE_NONE,
)
print(sms_spam.shape)
sms_spam.head()
(5572, 2)
Label | SMS | |
---|---|---|
0 | ham | Go until jurong point, crazy.. Available only ... |
1 | ham | Ok lar... Joking wif u oni... |
2 | spam | Free entry in 2 a wkly comp to win FA Cup fina... |
3 | ham | U dun say so early hor... U c already then say... |
4 | ham | Nah I don't think he goes to usf, he lives aro... |
# Relative frequency of each label across the whole dataset
sms_spam['Label'].value_counts(normalize=True)
ham 0.865937
spam 0.134063
Name: Label, dtype: float64
# Shuffle every row; the fixed seed keeps the shuffle reproducible
data_randomized = sms_spam.sample(frac=1, random_state=1)

# Row position at which the first 80% of the shuffled data ends
training_test_index = round(len(data_randomized) * 0.8)

# Rows before that position train the model; the remainder is held out
training_set = data_randomized.iloc[:training_test_index].reset_index(drop=True)
test_set = data_randomized.iloc[training_test_index:].reset_index(drop=True)

for subset in (training_set, test_set):
    print(subset.shape)
(4458, 2)
(1114, 2)
# Relative frequency of each label in the training split
training_set['Label'].value_counts(normalize=True)
ham 0.86541
spam 0.13459
Name: Label, dtype: float64
# Relative frequency of each label in the test split
test_set['Label'].value_counts(normalize=True)
ham 0.868043
spam 0.131957
Name: Label, dtype: float64
# Before cleaning: show the first three training messages as loaded
training_set.head(3)
Label | SMS | |
---|---|---|
0 | ham | Yep, by the pretty sculpture |
1 | ham | Yes, princess. Are you going to make me moan? |
2 | ham | Welp apparently he retired |
# After cleaning
# Replace every non-word character (punctuation, symbols) with a space.
# regex=True is spelled out because pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the text untouched; the raw
# string keeps '\W' from being read as an (invalid) string escape.
training_set['SMS'] = training_set['SMS'].str.replace(r'\W', ' ', regex=True)
# Lowercase so that the same word in different cases is counted once
training_set['SMS'] = training_set['SMS'].str.lower()
training_set.head(3)
Label | SMS | |
---|---|---|
0 | ham | yep by the pretty sculpture |
1 | ham | yes princess are you going to make me moan |
2 | ham | welp apparently he retired |
# Turn each cleaned message into a list of whitespace-separated words
training_set['SMS'] = training_set['SMS'].str.split()

# Gather every distinct word seen in the training messages
unique_words = set()
for sms in training_set['SMS']:
    unique_words.update(sms)
vocabulary = list(unique_words)

len(vocabulary)
7783
# Toy illustration of the target layout: one row per message, one column per
# word, each cell holding how often the word occurs in that message.
example_words = ['secret', 'prize', 'claim', 'now', 'coming',
                 'to', 'my', 'party', 'winner']
example_rows = [
    [2, 2, 1, 1, 0, 0, 0, 0, 0],
    [1, 0, 0, 0, 1, 1, 1, 1, 0],
    [1, 1, 1, 1, 0, 0, 0, 0, 1],
]

# Transpose the rows into the {word: [count per message]} mapping
word_counts_per_sms = {
    word: [row[position] for row in example_rows]
    for position, word in enumerate(example_words)
}

word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head()
secret | prize | claim | now | coming | to | my | party | winner | |
---|---|---|---|---|---|---|---|---|---|
0 | 2 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 |
1 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 |
2 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 1 |
# Start with an all-zero count list (one slot per training message) for
# every vocabulary word
n_training_rows = len(training_set['SMS'])
word_counts_per_sms = {}
for unique_word in vocabulary:
    word_counts_per_sms[unique_word] = [0] * n_training_rows

# Walk the tokenised messages and bump the slot of each word that occurs
for row_number, tokens in enumerate(training_set['SMS']):
    for token in tokens:
        word_counts_per_sms[token][row_number] += 1

word_counts = pd.DataFrame(word_counts_per_sms)
word_counts.head()
indyarocks | port | eg | surly | trained | voda | organise | vu | leg | 06 | ... | imposter | gee | 4882 | reduce | sexual | nange | conducts | noworriesloans | taxes | book | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 7783 columns
# Place the Label/SMS columns and the per-word count columns side by side
training_set_clean = pd.concat([training_set, word_counts], axis='columns')
training_set_clean.head()
Label | SMS | indyarocks | port | eg | surly | trained | voda | organise | vu | ... | imposter | gee | 4882 | reduce | sexual | nange | conducts | noworriesloans | taxes | book | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ham | [yep, by, the, pretty, sculpture] | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | ham | [yes, princess, are, you, going, to, make, me,... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | ham | [welp, apparently, he, retired] | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | ham | [havent] | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | ham | [i, forgot, 2, ask, ü, all, smth, there, s, a,... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 7785 columns
# Split the training rows by label
label_column = training_set_clean['Label']
spam_messages = training_set_clean[label_column == 'spam']
ham_messages = training_set_clean[label_column == 'ham']

# P(Spam) and P(Ham): share of training messages carrying each label
n_training_messages = len(training_set_clean)
p_spam = len(spam_messages) / n_training_messages
p_ham = len(ham_messages) / n_training_messages

# N_Spam and N_Ham: total number of words over all messages of each label
n_words_per_spam_message = spam_messages['SMS'].apply(len)
n_words_per_ham_message = ham_messages['SMS'].apply(len)
n_spam = n_words_per_spam_message.sum()
n_ham = n_words_per_ham_message.sum()

# N_Vocabulary: number of distinct words
n_vocabulary = len(vocabulary)

# Laplace smoothing
alpha = 1
# Every vocabulary word starts at 0 in both parameter tables
parameters_spam = dict.fromkeys(vocabulary, 0)
parameters_ham = dict.fromkeys(vocabulary, 0)

# Fill in the smoothed estimates P(word|Spam) and P(word|Ham):
# (occurrences of the word in the class + alpha)
#     / (words in the class + alpha * vocabulary size)
for word in vocabulary:
    n_word_given_spam = spam_messages[word].sum()
    n_word_given_ham = ham_messages[word].sum()

    p_word_given_spam = (n_word_given_spam + alpha) / (n_spam + alpha*n_vocabulary)
    p_word_given_ham = (n_word_given_ham + alpha) / (n_ham + alpha*n_vocabulary)

    parameters_spam[word] = p_word_given_spam
    parameters_ham[word] = p_word_given_ham
import re
def classify(message):
    '''
    Print the naive Bayes scores and the predicted label for one message.

    message: a string

    Reads the module-level p_spam, p_ham, parameters_spam and parameters_ham.
    Words missing from the parameter dictionaries are ignored. Nothing is
    returned; the two scores and the resulting label are printed.
    '''
    # Non-word characters become spaces, then lowercase and split into words
    message = re.sub(r'\W', ' ', message)
    message = message.lower().split()

    # The raw products are kept because they are what gets printed
    p_spam_given_message = p_spam
    p_ham_given_message = p_ham

    # The label is decided on log scores: for long messages the raw products
    # underflow to 0.0 on both sides, which would wrongly look like a tie.
    minus_infinity = float('-inf')
    log_spam_score = math.log(p_spam) if p_spam > 0 else minus_infinity
    log_ham_score = math.log(p_ham) if p_ham > 0 else minus_infinity

    for word in message:
        if word in parameters_spam:
            p_word = parameters_spam[word]
            p_spam_given_message *= p_word
            # A zero probability maps to -inf instead of raising in math.log
            log_spam_score += math.log(p_word) if p_word > 0 else minus_infinity
        if word in parameters_ham:
            p_word = parameters_ham[word]
            p_ham_given_message *= p_word
            log_ham_score += math.log(p_word) if p_word > 0 else minus_infinity

    print('P(Spam|message):', p_spam_given_message)
    print('P(Ham|message):', p_ham_given_message)

    if log_ham_score > log_spam_score:
        print('Label: Ham')
    elif log_ham_score < log_spam_score:
        print('Label: Spam')
    else:
        print('Equal probabilities, have a human classify this!')
# Demo: run the classifier on a sample message
classify('WINNER!! This is the secret code to unlock the money: C3421.')
P(Spam|message): 1.3481290211300841e-25
P(Ham|message): 1.9368049028589875e-27
Label: Spam
# Demo: run the classifier on a second sample message
classify("Sounds good, Tom, then see u there")
P(Spam|message): 2.4372375665888117e-25
P(Ham|message): 3.687530435009238e-21
Label: Ham
def classify_test_set(message):
    '''
    Return the predicted label for one message.

    message: a string

    Reads the module-level p_spam, p_ham, parameters_spam and parameters_ham.
    Words missing from the parameter dictionaries are ignored.

    Returns 'ham' or 'spam', or 'needs human classification' when both
    classes score exactly the same.
    '''
    # Non-word characters become spaces, then lowercase and split into words
    message = re.sub(r'\W', ' ', message)
    message = message.lower().split()

    # Scores are accumulated as sums of logs rather than products of
    # probabilities: for long messages the product underflows to 0.0 for
    # both classes, which would wrongly be reported as a tie.
    minus_infinity = float('-inf')
    log_spam_score = math.log(p_spam) if p_spam > 0 else minus_infinity
    log_ham_score = math.log(p_ham) if p_ham > 0 else minus_infinity

    for word in message:
        if word in parameters_spam:
            p_word = parameters_spam[word]
            # A zero probability maps to -inf instead of raising in math.log
            log_spam_score += math.log(p_word) if p_word > 0 else minus_infinity
        if word in parameters_ham:
            p_word = parameters_ham[word]
            log_ham_score += math.log(p_word) if p_word > 0 else minus_infinity

    if log_ham_score > log_spam_score:
        return 'ham'
    elif log_spam_score > log_ham_score:
        return 'spam'
    else:
        return 'needs human classification'
# Label every held-out message with the classifier and store the result
# next to the true label
test_set['predicted'] = test_set['SMS'].map(classify_test_set)
test_set.head()
Label | SMS | predicted | |
---|---|---|---|
0 | ham | Later i guess. I needa do mcat study too. | ham |
1 | ham | But i haf enuff space got like 4 mb... | ham |
2 | spam | Had your mobile 10 mths? Update to latest Oran... | spam |
3 | ham | All sounds good. Fingers . Makes it difficult ... | ham |
4 | ham | All done, all handed in. Don't know if mega sh... | ham |
# Count the test rows whose prediction equals the true label. A vectorised
# comparison replaces the row-by-row iterrows() loop; int() keeps `correct`
# a plain Python integer as before.
total = test_set.shape[0]
correct = int((test_set['Label'] == test_set['predicted']).sum())
print('Correct:', correct)
print('Incorrect:', total - correct)
# An empty test set reports nan instead of raising ZeroDivisionError
print('Accuracy:', correct/total if total else float('nan'))
Correct: 1100
Incorrect: 14
Accuracy: 0.9874326750448833
tags: !NLPIndex