Spam detection with neural networks: LSTM or CNN (1-D convolution) both work well [the code has problems, skip]

import os

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tflearn
from sklearn import metrics, svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.layers.conv import conv_1d, global_max_pool
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.estimator import regression
from tflearn.layers.merge_ops import merge

max_features = 500          # vocabulary cap for the bag-of-words features
max_document_length = 1024  # padded sequence length for the CNN/RNN inputs

def load_one_file(filename):
    # Read one mail file into a single string with line breaks removed.
    x = ""
    with open(filename) as f:
        for line in f:
            line = line.strip('\n')
            line = line.strip('\r')
            x += line
    return x

def load_files_from_dir(rootdir):
    # Collect the contents of every file directly under rootdir.
    x = []
    for name in os.listdir(rootdir):
        path = os.path.join(rootdir, name)
        if os.path.isfile(path):
            x.append(load_one_file(path))
    return x

def load_all_files():
    # Load ham and spam from the enron1..enron4 subsets of the corpus.
    ham = []
    spam = []
    for i in range(1, 5):
        path = "../data/mail/enron%d/ham/" % i
        print("Load %s" % path)
        ham += load_files_from_dir(path)
        path = "../data/mail/enron%d/spam/" % i
        print("Load %s" % path)
        spam += load_files_from_dir(path)
    return ham, spam

def get_features_by_wordbag():
    # Bag-of-words features: raw term counts, capped at max_features terms.
    ham, spam = load_all_files()
    x = ham + spam
    y = [0] * len(ham) + [1] * len(spam)
    vectorizer = CountVectorizer(decode_error='ignore',
                                 strip_accents='ascii',
                                 max_features=max_features,
                                 stop_words='english',
                                 max_df=1.0,
                                 min_df=1)
    print(vectorizer)
    x = vectorizer.fit_transform(x)
    x = x.toarray()
    return x, y

def show_different_max_features():
    # Sweep max_features and plot GaussianNB accuracy against vocabulary size.
    global max_features
    a = []
    b = []
    for i in range(1000, 20000, 2000):
        max_features = i
        print("max_features=%d" % i)
        x, y = get_features_by_wordbag()
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)
        gnb = GaussianNB()
        gnb.fit(x_train, y_train)
        y_pred = gnb.predict(x_test)
        score = metrics.accuracy_score(y_test, y_pred)
        a.append(max_features)
        b.append(score)
    plt.plot(a, b, 'r')
    plt.xlabel("max_features")
    plt.ylabel("accuracy")
    plt.title("accuracy vs max_features")
    plt.show()

def do_nb_wordbag(x_train, x_test, y_train, y_test):
    print("NB and wordbag")
    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    y_pred = gnb.predict(x_test)
    print(metrics.accuracy_score(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))

def do_svm_wordbag(x_train, x_test, y_train, y_test):
    print("SVM and wordbag")
    clf = svm.SVC()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(metrics.accuracy_score(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))

def get_features_by_wordbag_tfidf():
    # Binary bag-of-words followed by a TF-IDF transform.
    ham, spam = load_all_files()
    x = ham + spam
    y = [0] * len(ham) + [1] * len(spam)
    vectorizer = CountVectorizer(binary=True,
                                 decode_error='ignore',
                                 strip_accents='ascii',
                                 max_features=max_features,
                                 stop_words='english',
                                 max_df=1.0,
                                 min_df=1)
    print(vectorizer)
    x = vectorizer.fit_transform(x)
    x = x.toarray()
    transformer = TfidfTransformer(smooth_idf=False)
    print(transformer)
    tfidf = transformer.fit_transform(x)
    x = tfidf.toarray()
    return x, y
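Side note: the two-step CountVectorizer + TfidfTransformer pipeline above can be collapsed into sklearn's TfidfVectorizer. A minimal one-step sketch, assuming the same parameters as above (the function name is mine):

from sklearn.feature_extraction.text import TfidfVectorizer

def get_features_by_tfidf_vectorizer(ham, spam):
    # One-step equivalent of binary counts followed by TF-IDF weighting.
    vectorizer = TfidfVectorizer(binary=True,
                                 decode_error='ignore',
                                 strip_accents='ascii',
                                 max_features=max_features,
                                 stop_words='english',
                                 smooth_idf=False)
    x = vectorizer.fit_transform(ham + spam).toarray()
    y = [0] * len(ham) + [1] * len(spam)
    return x, y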

def do_cnn_wordbag(trainX, testX, trainY, testY):
    global max_document_length
    print("CNN and tf")

    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Building convolutional network: three parallel 1-D convolutions with
    # kernel sizes 3/4/5, concatenated and max-pooled over time.
    network = input_data(shape=[None, max_document_length], name='input')
    network = tflearn.embedding(network, input_dim=1000000, output_dim=128)
    branch1 = conv_1d(network, 128, 3, padding='valid', activation='relu', regularizer="L2")
    branch2 = conv_1d(network, 128, 4, padding='valid', activation='relu', regularizer="L2")
    branch3 = conv_1d(network, 128, 5, padding='valid', activation='relu', regularizer="L2")
    network = merge([branch1, branch2, branch3], mode='concat', axis=1)
    network = tf.expand_dims(network, 2)
    network = global_max_pool(network)
    network = dropout(network, 0.8)
    network = fully_connected(network, 2, activation='softmax')
    network = regression(network, optimizer='adam', learning_rate=0.001,
                         loss='categorical_crossentropy', name='target')
    # Training
    model = tflearn.DNN(network, tensorboard_verbose=0)
    model.fit(trainX, trainY,
              n_epoch=5, shuffle=True, validation_set=(testX, testY),
              show_metric=True, batch_size=100, run_id="spam")
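The tflearn stack above only runs on TensorFlow 1.x. For readers on TensorFlow 2, here is a minimal Keras sketch of the same multi-kernel 1-D CNN idea (a standard TextCNN-style variant, not the author's exact graph: it pools each branch before concatenating, and the function name and defaults are my assumptions; note that tflearn's dropout(x, 0.8) is a keep probability, i.e. a Keras dropout rate of 0.2):

from tensorflow.keras import layers, Model, Input

def build_text_cnn(vocab_size=1000000, seq_len=1024):
    # Three parallel Conv1D branches (kernel sizes 3/4/5), each max-pooled
    # over time, then concatenated and fed to a 2-way softmax.
    inp = Input(shape=(seq_len,))
    emb = layers.Embedding(vocab_size, 128)(inp)
    branches = [layers.GlobalMaxPooling1D()(
                    layers.Conv1D(128, k, activation='relu')(emb))
                for k in (3, 4, 5)]
    x = layers.Concatenate()(branches)
    x = layers.Dropout(0.2)(x)
    out = layers.Dense(2, activation='softmax')(x)
    model = Model(inp, out)
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model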

def do_rnn_wordbag(trainX, testX, trainY, testY):
    global max_document_length
    print("RNN and wordbag")

    trainX = pad_sequences(trainX, maxlen=max_document_length, value=0.)
    testX = pad_sequences(testX, maxlen=max_document_length, value=0.)
    # Converting labels to binary vectors
    trainY = to_categorical(trainY, nb_classes=2)
    testY = to_categorical(testY, nb_classes=2)

    # Network building: embedding followed by a single LSTM layer
    net = tflearn.input_data([None, max_document_length])
    net = tflearn.embedding(net, input_dim=10240000, output_dim=128)
    net = tflearn.lstm(net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='adam', learning_rate=0.001,
                             loss='categorical_crossentropy')

    # Training
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True,
              batch_size=10, run_id="spm-run", n_epoch=5)
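The LSTM model ports to Keras the same way. A sketch under the same assumptions (also note that input_dim=10240000 above is far larger than any real mail vocabulary; the fitted vocabulary size would be enough):

from tensorflow.keras import layers, Sequential

def build_text_lstm(vocab_size=10240000):
    # Embedding -> single LSTM layer -> 2-way softmax over {ham, spam}.
    model = Sequential([
        layers.Embedding(vocab_size, 128),
        layers.LSTM(128, dropout=0.2),
        layers.Dense(2, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model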

def do_dnn_wordbag(x_train, x_test, y_train, y_test):
    print("DNN and wordbag")

    # Building deep neural network
    clf = MLPClassifier(solver='lbfgs',
                        alpha=1e-5,
                        hidden_layer_sizes=(5, 2),
                        random_state=1)
    print(clf)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(metrics.accuracy_score(y_test, y_pred))
    print(metrics.confusion_matrix(y_test, y_pred))

def get_features_by_tf():
    # Encode each mail as a fixed-length sequence of word ids.
    global max_document_length
    ham, spam = load_all_files()
    x = ham + spam
    y = [0] * len(ham) + [1] * len(spam)
    vp = tflearn.data_utils.VocabularyProcessor(max_document_length=max_document_length,
                                                min_frequency=0,
                                                vocabulary=None,
                                                tokenizer_fn=None)
    x = vp.fit_transform(x, unused_y=None)
    x = np.array(list(x))
    return x, y
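VocabularyProcessor comes from tf.contrib and no longer exists in TensorFlow 2. A sketch of the same word-id encoding using the Keras tokenizer instead (my substitution, not the original code):

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences as keras_pad_sequences

def get_features_by_keras_tokenizer(ham, spam):
    # Map each mail to a fixed-length, zero-padded sequence of word ids.
    texts = ham + spam
    tok = Tokenizer()
    tok.fit_on_texts(texts)
    x = keras_pad_sequences(tok.texts_to_sequences(texts),
                            maxlen=max_document_length, padding='post')
    y = [0] * len(ham) + [1] * len(spam)
    return x, y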

if __name__ == "__main__":
    print("Hello spam-mail")
    #print("get_features_by_wordbag")
    #x, y = get_features_by_wordbag()
    #x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)

    #print("get_features_by_wordbag_tfidf")
    #x, y = get_features_by_wordbag_tfidf()
    #x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)
    #NB
    #do_nb_wordbag(x_train, x_test, y_train, y_test)
    #show_different_max_features()

    #SVM
    #do_svm_wordbag(x_train, x_test, y_train, y_test)

    #DNN
    #do_dnn_wordbag(x_train, x_test, y_train, y_test)

    print("get_features_by_tf")
    # Fix: the CNN/RNN models consume word-id sequences, so build features
    # with get_features_by_tf(); the original called get_features_by_wordbag()
    # here, which matches neither the print statement nor the models' input.
    x, y = get_features_by_tf()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)
    #CNN
    do_cnn_wordbag(x_train, x_test, y_train, y_test)

    #RNN
    #do_rnn_wordbag(x_train, x_test, y_train, y_test)

When you write your own detection code, remember to compare several algorithms as well.
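For example, a minimal comparison harness over the bag-of-words features (the classifier list is illustrative, reusing the models defined above):

def compare_models(x_train, x_test, y_train, y_test):
    # Fit each candidate on the same split and report held-out accuracy.
    candidates = [("NB", GaussianNB()),
                  ("SVM", svm.SVC()),
                  ("MLP", MLPClassifier(solver='lbfgs', alpha=1e-5,
                                        hidden_layer_sizes=(5, 2),
                                        random_state=1))]
    for name, clf in candidates:
        clf.fit(x_train, y_train)
        acc = metrics.accuracy_score(y_test, clf.predict(x_test))
        print("%-4s accuracy: %.4f" % (name, acc))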