Spam Detection with SMS Data

This project is about detecting spam short message service (SMS) messages, in order to practice building machine learning models and to play around with text data. The data I will be looking at is from Kaggle.

The steps I will be taking for this project are:

1. Explore the data
2. Clean the text
3. Build and evaluate models (sklearn and TensorFlow)
4. Try the models on unseen data

What does the data look like?

All the columns are object data types, which means we most likely have text (string) data in each column.
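A minimal sketch of the load-and-inspect step, assuming the Kaggle CSV is saved locally as spam.csv (this dataset is typically latin-1 encoded):

```python
import pandas as pd

# "spam.csv" and the encoding are assumptions about the local setup.
df = pd.read_csv("spam.csv", encoding="latin-1")
print(df.dtypes)   # every column shows as "object" (strings)
print(df.head())
```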

To check if there are any null (empty) values in the columns:
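Something along these lines, with pandas:

```python
# Number of null values in each column.
print(df.isnull().sum())
```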

The columns Unnamed: 2, Unnamed: 3, and Unnamed: 4 have a large number of null values (the proportion of null to non-null values is large). The description of the dataset (from Kaggle) says that v1 is the classification and v2 is the SMS, so it makes sense that the other columns are mostly null. I wonder why they were included at all?

Since v1 and v2 are the only columns we need, we can rename them to class and msg for clarity.
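A sketch of the rename, which also drops the mostly-null columns:

```python
# Keep only the useful columns and give them clearer names.
df = df[["v1", "v2"]].rename(columns={"v1": "class", "v2": "msg"})
```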

Note that the ham class has a lot more entries than spam, which means that this is an imbalanced dataset.
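A quick check of the imbalance:

```python
# Ham far outnumbers spam in this dataset.
print(df["class"].value_counts())
print(df["class"].value_counts(normalize=True))
```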

Creating a word cloud of both spam and ham messages will help us visualise the difference between the two groups.
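A sketch using the third-party wordcloud package (my assumption; any word-frequency plot would work):

```python
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# One cloud per class, built from all messages of that class.
for label in ["spam", "ham"]:
    text = " ".join(df.loc[df["class"] == label, "msg"])
    wc = WordCloud(width=600, height=400, background_color="white").generate(text)
    plt.figure()
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(f"{label} messages")
plt.show()
```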

Spam SMS have words like "txt", "call", "free", "mobile", ... while common ham words are "will", "u", "now", "go", ... The two sets share some words ("u", "now", "call", ...), but hopefully the difference between spam and ham messages is big enough that the models can tell them apart most of the time.

Let's start cleaning the data! The goal is to create the smallest set of unique words that will still identify whether an SMS is spam. Punctuation and whitespace will be removed: whitespace will not help us classify the SMS, and "Hello!!!" means the same as "Hello" (you could argue punctuation carries some signal, but we want the smallest set possible). Stop words, which are commonly used words (a, the, is, are) that make a sentence coherent but carry little meaning for classification, will be removed along with any numbers.

I will also try to reduce words to a root form, using stemming (PorterStemmer) and lemmatization (WordNetLemmatizer).
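A minimal cleaning sketch with NLTK (assumes the stopwords and wordnet corpora have been downloaded via nltk.download):

```python
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def clean_msg(msg):
    msg = msg.lower()
    msg = re.sub(r"[^a-z\s]", " ", msg)            # drop punctuation and digits
    words = [w for w in msg.split() if w not in stop_words]
    words = [stemmer.stem(w) for w in words]       # or: lemmatizer.lemmatize(w)
    return " ".join(words)

df["clean_msg"] = df["msg"].apply(clean_msg)
```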

Creating the Models

For this dataset we have two classes: spam or ham.

The models I will be testing are:

From sklearn:

- Multinomial Naive Bayes
- Support Vector Machine (SVM)
- Logistic Regression

From TensorFlow:

- Long Short-Term Memory (LSTM)

First I will gather the data into the features to be trained on (X) and the corresponding classifications (y). I will then split the data into training and validation sets (80% for training, 20% for validation). The training set is what the model uses to "learn", and the validation set checks how the model does on data it has not seen. The training and validation sets should be disjoint (no SMS in common).
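A sketch of the split; stratifying on the label (my choice here) keeps the spam/ham ratio similar in both sets:

```python
from sklearn.model_selection import train_test_split

X = df["clean_msg"]
y = df["class"]

# 80/20 split; stratify so both sets keep the same class proportions.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
```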

For the sklearn models I will create a pipeline, shared by all of them, that first converts the words into counts (CountVectorizer), then weighs each word based on the SMS it appears in relative to the whole dataset (TfidfTransformer), and finally applies the classification method for each model.
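One way to express the shared pipeline is a small helper that takes the classifier as a parameter (make_pipeline below is my own name for it):

```python
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

def make_pipeline(clf):
    return Pipeline([
        ("vect", CountVectorizer()),    # words -> token counts
        ("tfidf", TfidfTransformer()),  # counts -> TF-IDF weights
        ("clf", clf),                   # the classification method
    ])
```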

Multinomial Naive Bayes

The first model I will use is the Multinomial Naive Bayes model.

The reason I think this model could make good predictions is that Naive Bayes models are based on the probabilities of a set of words given the classification.

The MultiNB model is ~95% accurate on the validation data, but overall accuracy does not tell the whole story! With a classification report we can see the precision and recall of the model.
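Fitting and scoring, using the make_pipeline helper sketched above:

```python
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

nb_model = make_pipeline(MultinomialNB())
nb_model.fit(X_train, y_train)

y_pred = nb_model.predict(X_val)
print(nb_model.score(X_val, y_val))          # overall accuracy
print(classification_report(y_val, y_pred))  # per-class precision/recall
```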

Personally, I would rather a ham SMS never be marked as spam, and I would be okay with a few spam SMS making their way through the model. The ham predictions have a recall of 100%, which means every ham SMS was predicted correctly (yay!).

This is the heatmap of the predictions on the validation set. I think the heatmap gives a good visualization of the predictions since we only have 2 classes.
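One way to draw it, with seaborn:

```python
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_val, y_pred, labels=["ham", "spam"])
sns.heatmap(cm, annot=True, fmt="d",
            xticklabels=["ham", "spam"], yticklabels=["ham", "spam"])
plt.xlabel("predicted")
plt.ylabel("actual")
plt.show()
```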

Support Vector Machine

An SVM tries to "draw" a line or curve that separates the classes; which side of the line a data point falls on determines the prediction. This is easy to visualize in 2 dimensions, but our data has far more than 3 dimensions, which is very difficult to show.
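Swapping the classifier in the same pipeline; I use sklearn's SVC here, though the exact SVM variant and kernel are assumptions:

```python
from sklearn.svm import SVC

svm_model = make_pipeline(SVC())   # default RBF kernel
svm_model.fit(X_train, y_train)
print(svm_model.score(X_val, y_val))
```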

The SVM model has an accuracy of ~97.9%, which is higher than MultiNB, but what about the precision and recall?

The recall of the ham class went down to 99%, meaning a ham SMS now has a 99% chance of being predicted correctly. Even though the overall accuracy went up, recall on ham SMS is no longer perfect. Let's see which ham SMS were marked as spam.
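Pulling those messages out of the validation set:

```python
# Messages that are actually ham but were predicted as spam.
y_pred = svm_model.predict(X_val)
false_spam = X_val[(y_val == "ham") & (y_pred == "spam")]
print(false_spam.tolist())
```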

We can see that the ham messages marked as spam have words like "new", "chance", "text", "message", "reply". These are words you would expect in a spam SMS.

Logistic Regression
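The same pipeline pattern again (raising max_iter so the solver converges is my own tweak):

```python
from sklearn.linear_model import LogisticRegression

lr_model = make_pipeline(LogisticRegression(max_iter=1000))
lr_model.fit(X_train, y_train)
print(lr_model.score(X_val, y_val))
```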

The Logistic Regression model's overall accuracy, ~96.2%, is lower than the SVM model's but slightly better than the MultiNB model's. We can see that recall on ham SMS is 100% (yay!) and the other statistics are slightly better than MultiNB's!

TensorFlow

I will be testing a Long Short-Term Memory (LSTM) network. I think this would be a good fit because the data is a sequence of words (a sentence), and I believe earlier words in the sequence relate to later words when predicting whether the SMS is spam.

Setting up the data is similar to the sklearn models but we no longer have the nice pipeline.

I first count the number of unique words in the cleaned set of words. This is used by the tokenizer, which converts the features into sequences by assigning a number to each word in the feature set. We will also need to make sure the sequences fed to the model are all the same length (padding with 0's), so let's look at a rough plot of how the word counts of the cleaned SMS are distributed.
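A sketch of both steps, counting the vocabulary and plotting the length distribution:

```python
import matplotlib.pyplot as plt

all_words = df["clean_msg"].str.split()
vocab_size = len(set(w for words in all_words for w in words))
lengths = all_words.str.len()   # words per cleaned SMS

plt.hist(lengths, bins=40)
plt.xlabel("words per cleaned SMS")
plt.ylabel("count")
plt.show()
```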

The distribution is right skewed.

When choosing a length for the sequences to be padded to: if we used the largest message length (89 words), most sequences would be mostly useless padding. If we used the median, we would truncate half of the SMS when converting them into sequences. So I think the best sequence length is one that covers 99% of the messages, which is 33.
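The percentile itself can be computed directly:

```python
import numpy as np

# 99th percentile of message lengths -- the padding length (~33 here).
max_len = int(np.percentile(lengths, 99))
```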

I will also convert the classifications to 0 or 1 (ham and spam respectively).

First create the tokenizer from the words in the cleaned SMS.

Separate into training and validation sets

Confirm that we have about the same proportion of spam and ham messages in each set.

Then turn the features into sequences

Then pad the sequence with 0's to make sure all sequences are equal length (length of 33)

Note: 0 is not assigned to any word by the tokenizer, and about 1% of the SMS will be truncated.
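A sketch covering the steps above, with names carried over from earlier blocks:

```python
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

labels = (df["class"] == "spam").astype(int)   # ham -> 0, spam -> 1

# The tokenizer assigns an integer (starting at 1) to each word.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df["clean_msg"])

X_train_txt, X_val_txt, y_train_nn, y_val_nn = train_test_split(
    df["clean_msg"], labels, test_size=0.2, random_state=42, stratify=labels
)

# Convert to integer sequences, then zero-pad/truncate to max_len (33).
X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train_txt), maxlen=max_len)
X_val_seq = pad_sequences(tokenizer.texts_to_sequences(X_val_txt), maxlen=max_len)
```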

Now we can make the TensorFlow model.
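One possible architecture; the layer sizes and epoch count are my assumptions, not the notebook's original values:

```python
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size + 1, output_dim=32),
    tf.keras.layers.LSTM(32, dropout=0.2),            # dropout is the stochastic part
    tf.keras.layers.Dense(1, activation="sigmoid"),   # outputs P(spam)
])
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

history = model.fit(X_train_seq, y_train_nn, epochs=10,
                    validation_data=(X_val_seq, y_val_nn))
```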

To get the classification report, we need to do a bit of work to convert the predictions into the format that classification_report expects.
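Thresholding the predicted probabilities at 0.5:

```python
from sklearn.metrics import classification_report

probs = model.predict(X_val_seq)              # shape (n, 1) of P(spam)
preds = (probs > 0.5).astype(int).ravel()     # 0 = ham, 1 = spam
print(classification_report(y_val_nn, preds, target_names=["ham", "spam"]))
```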

Running this model will give slightly different outcomes because of the stochastic nature of the dropout in the LSTM layer, but on the last run of this notebook we got an overall accuracy of ~98% and a ham recall of 100%.

Looking at the loss and accuracy plots, we can see that the model does better on the training set than on the validation set. I think the difference is okay and the model is not overfitting, because the gap between training and validation is not that big.
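A sketch of how those plots can be drawn from the Keras history object:

```python
import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.plot(history.history["loss"], label="train")
ax1.plot(history.history["val_loss"], label="validation")
ax1.set_title("loss")
ax1.legend()
ax2.plot(history.history["accuracy"], label="train")
ax2.plot(history.history["val_accuracy"], label="validation")
ax2.set_title("accuracy")
ax2.legend()
plt.show()
```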

Conclusion

Overall, I think I learned a lot about handling text data, and I feel a bit more comfortable implementing models for prediction.

But before I end this notebook let's try some data that the models have not seen before!
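A sketch with two made-up messages, run through both kinds of model (the message texts are hypothetical):

```python
new_sms = [
    "Congratulations! You have won a free prize, call now to claim it!",
    "Hey, are we still on for lunch tomorrow?",
]
cleaned = [clean_msg(s) for s in new_sms]

# The sklearn pipeline predicts the string labels directly.
print(nb_model.predict(cleaned))

# The LSTM needs the same tokenize-and-pad treatment as the training data.
seqs = pad_sequences(tokenizer.texts_to_sequences(cleaned), maxlen=max_len)
print((model.predict(seqs) > 0.5).astype(int).ravel())   # 1 = spam
```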