In this guide, we take a look at Natural Language Processing (NLP) in Python. Natural language processing is a very extensive topic, and with it you can build very powerful machine learning models.
This is useful in many areas of most industries; here we apply it to build an SMS spam filter.
Credits to Jose Portilla, creator of the Learning Python for Data Analysis and Visualization course on Udemy.
# conda install nltk  # at the command prompt, or here in Jupyter
# import nltk
# nltk.download()  # to download the necessary corpora (e.g., the stopwords we use below)
import nltk
# read the file in the smsspamcollection folder
with open('smsspamcollection/SMSSpamCollection') as f:
    messages = [line.rstrip() for line in f]
print(len(messages))
# let's check the first few messages in the file
for num, message in enumerate(messages[:10]):
    print(num, message)
    print('\n')
# let's read the spam file into a pandas DataFrame
import pandas as pd
messages = pd.read_csv('smsspamcollection/SMSSpamCollection', sep='\t', names=['labels', 'message'])  # pass names to label the columns
messages.head()
# let's check some statistics about our data
messages.describe()
messages.info()
# let's group by label and check the statistics for each class
messages.groupby('labels').describe()
# let's get the length of each message
messages['length'] = messages['message'].apply(len)
messages.head()
# let's visualize our data
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
#%matplotlib qt
#check statistics to choose bin size for histogram
messages['length'].describe()
messages['length'].plot(bins=50, kind='hist')
# let's get the message with the longest text and view the text itself
messages[messages['length'] == messages['length'].max()]['message'].iloc[0]
# let's plot histograms of message length by label (notice that spam messages tend to be longer)
messages.hist(column='length', by='labels', bins=50, figsize=(20,8))
# our goal is to convert all the text in the message column of our DataFrame into some form of numerical vector so we can build our model
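Before we build the real thing, here is a tiny toy illustration (my own sketch, not part of the pipeline) of what turning text into a numerical vector means: each text becomes a vector of word counts over a shared vocabulary.
from collections import Counter
toy_texts = ['free prize call now', 'call me when you are free']  # made-up examples
vocabulary = sorted(set(' '.join(toy_texts).split()))
print(vocabulary)
for text in toy_texts:
    counts = Counter(text.split())
    print([counts[word] for word in vocabulary])  # one count vector per text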
import string
# let's try to get rid of punctuation in the message texts, first trying our hand on a small sample text
mess = 'Sample message! Notice: It contains punctuation.'
# let's check the punctuation characters provided by the string module
string.punctuation
nopunc = [char for char in mess if char not in string.punctuation]
nopunc
# now let's join the characters without the punctuation back into whole words
nopunc = ''.join(nopunc)
nopunc
# now let's remove stop words
from nltk.corpus import stopwords
# let's check some stopwords
stopwords.words('english')[0:10]
# now let's split the words in our nopunc message into a list
nopunc = nopunc.split()
nopunc
# now let's remove all stop words from this list
clean_mess = [word for word in nopunc if word.lower() not in stopwords.words('english')]
clean_mess
# let's put the above cleaning steps into a function
def text_process(mess):
    # check characters to see if they are in punctuation
    nopunc = [char for char in mess if char not in string.punctuation]
    # join the characters without the punctuation back into whole words
    nopunc = ''.join(nopunc)
    # now remove all stop words from the resulting list of words
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
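As a quick sanity check (my addition), calling the function on the sample message should reproduce the cleaned list we built step by step above.
text_process(mess)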
# let's check our real messages
messages.head()
# let's tokenize our messages, that is, convert the text strings into lists of tokens (the words we want)
# let's check a quick example
messages['message'].head(5).apply(text_process)
# let's vectorize our text
from sklearn.feature_extraction.text import CountVectorizer
# let's call the CountVectorizer constructor and pass our function to it as the analyzer
bow_transformer = CountVectorizer(analyzer=text_process)
# fit the vectorizer to the messages (this builds the vocabulary)
bow_transformer.fit(messages['message'])
# let's look at a sample message
message4 = messages['message'][3]
message4
# let's check how message4 has been transformed
bag_of_words_4 = bow_transformer.transform([message4])
print(bag_of_words_4)
From the results above, each row of the sparse output has the form (document index, word index) followed by a count: the number next to the zero inside the parentheses is the index of a word in the vocabulary, and the number outside the parentheses is how many times that word appears in message 4. Each printed row therefore corresponds to one unique word in the message, and a count greater than 1 means that word occurs more than once.
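To make that mapping concrete, here is a small illustrative snippet (not from the original walkthrough) that prints each word in message 4 next to its count:
feature_names = bow_transformer.get_feature_names()
_, word_indices = bag_of_words_4.nonzero()  # column indices of the non-zero entries
for word_index in word_indices:
    print(feature_names[word_index], bag_of_words_4[0, word_index])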
# we can look up the actual words by their indices in the transformer's vocabulary
# (newer scikit-learn versions use get_feature_names_out() instead)
print(bow_transformer.get_feature_names()[9554])
print(bow_transformer.get_feature_names()[4068])
# now that we can see it works, let's apply it to all the messages in our DataFrame
messages_bow = bow_transformer.transform(messages['message'])
# let's check some features of the resulting matrix
print('Shape of sparse matrix: ', messages_bow.shape)
print('Amount of non-zero occurrences: ', messages_bow.nnz)
print('Sparsity : %.2f%%' % (100.0* messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1])))
#fit TfidfTransformer to our messages
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(messages_bow)
# let's transform a single message and see
tfidf4 = tfidf_transformer.transform(bag_of_words_4)
print(tfidf4)
# let's check the inverse document frequency (IDF) of some words
print(tfidf_transformer.idf_[bow_transformer.vocabulary_['u']])
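As a sketch (assuming scikit-learn's default smooth_idf=True), we can reproduce that number by hand; the smoothed formula is idf(t) = ln((1 + n_documents) / (1 + document_frequency(t))) + 1.
import numpy as np
u_index = bow_transformer.vocabulary_['u']
n_documents = messages_bow.shape[0]
doc_freq = messages_bow[:, u_index].nnz  # number of messages containing 'u'
print(np.log((1 + n_documents) / (1 + doc_freq)) + 1)  # should match the idf_ value above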
# let's transform the entire set of messages
messages_tfidf = tfidf_transformer.transform(messages_bow)
print(messages_tfidf.shape)
#now we are going to train our model
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(messages_tfidf, messages['labels'])
# let's check the model's prediction on a single message
print('Predicted :', spam_detect_model.predict(tfidf4)[0])
print('Expected : ', messages['labels'][3])
# let's see how our model performs on the full set of messages
all_predictions = spam_detect_model.predict(messages_tfidf)
print(all_predictions)
# let's evaluate our model using precision and recall
from sklearn.metrics import classification_report
print(classification_report(messages['labels'], all_predictions))
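Precision and recall are derived from the confusion matrix, so as a quick illustration (my addition) we can print it for these in-sample predictions:
from sklearn.metrics import confusion_matrix
# rows are the true labels, columns are the predicted labels
print(confusion_matrix(messages['labels'], all_predictions))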
# in the model training above we validated on the same data we trained on;
# that is not good practice, so let's split our data into training and test sets
from sklearn.model_selection import train_test_split  # sklearn.cross_validation in older scikit-learn versions
msg_train, msg_test, label_train, label_test = train_test_split(messages['message'], messages['labels'], test_size=0.2)
# let's check the sizes of the training and test sets
print(len(msg_train), len(msg_test), len(msg_train) + len(msg_test))
# let's put all our preprocessing and modeling steps into a single pipeline
from sklearn.pipeline import Pipeline
# let's define the steps of our pipeline
pipeline = Pipeline([('bow', CountVectorizer(analyzer=text_process)),
('tfidf',TfidfTransformer()),
('classifier', MultinomialNB())])
# let's fit the pipeline on the training data
pipeline.fit(msg_train, label_train)
# now let's make predictions on our test data
predictions = pipeline.predict(msg_test)
print(classification_report(label_test, predictions))
From the predictions we achieved about 98% precision, 97% recall, and a 97% F1-score. We can still seek to improve on this. 🙂
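As one possible next step (a sketch, not part of the original walkthrough), we could swap a different classifier into the same pipeline and compare the reports, for example a random forest:
from sklearn.ensemble import RandomForestClassifier
pipeline_rf = Pipeline([('bow', CountVectorizer(analyzer=text_process)),
                        ('tfidf', TfidfTransformer()),
                        ('classifier', RandomForestClassifier())])
pipeline_rf.fit(msg_train, label_train)
print(classification_report(label_test, pipeline_rf.predict(msg_test)))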