Applications of Naive Bayes
Spam filtering
The best-known application of the naive Bayes classifier is spam filtering. For a detailed treatment, see the corresponding chapters of "Hackers and Painters" or "The Beauty of Mathematics" (《数学之美》); for the basic Bayes implementation, see here.
Dataset
Two folders, one of legitimate (ham) email and one of spam, each containing 25 messages.
Test method
Randomly pick 10 of the 50 messages as test data.
Implementation notes
1. First the raw text has to be turned into the vector form we need, which takes a little regular-expression work (see the short sketch after this list).
2. Because hold-out cross-validation picks the test set at random, the results differ slightly from run to run.
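As a quick illustration of note 1, here is a minimal sketch of what the textParse function below does: split on runs of non-word characters, lowercase, and drop very short tokens. The sample string is made up for illustration.

import re

sample = "Hi there, this is a TEST email -- buy now!!!"
tokens = re.split(r'\W+', sample)                    # split on non-word characters
tokens = [t.lower() for t in tokens if len(t) > 2]   # lowercase, drop short/empty tokens
print(tokens)  # ['there', 'this', 'test', 'email', 'buy', 'now']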
#coding=utf-8
from numpy import *

# Parse a document into a list of lowercase tokens,
# dropping anything shorter than three characters
def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W+', bigString)  # split on runs of non-word characters
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]


# Build the vocabulary: a list of every distinct word in the data set
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)

# Set-of-words model: mark each vocabulary word as present (1) or absent (0)
def setOfWords2Vec(vocabList, inputSet):
    retVocabList = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            retVocabList[vocabList.index(word)] = 1
        else:
            print('word', word, 'not in dict')
    return retVocabList

# The other model (bag of words): count occurrences of each vocabulary word
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

def trainNB0(trainMatrix, trainCategory):
    numTrainDoc = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDoc)
    # Initialize counts to 1 and denominators to 2 (Laplace smoothing)
    # so that no single factor in the product of probabilities is zero
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDoc):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Take logs for numerical precision; otherwise the product of many
    # small probabilities is very likely to underflow to zero
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)  # element-wise mult
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

def spamTest(spamFolder, hamFolder):
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        wordList = textParse(open(spamFolder + str(i) + '.txt').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open(hamFolder + str(i) + '.txt').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = list(range(50))
    testSet = []
    # Hold out 10 randomly chosen messages for testing
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        #trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the held-out messages
        #wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print('classification error', docList[docIndex])
    print('the error rate is: ', float(errorCount) / len(testSet))
    #return vocabList, fullText


def main():
    spamTest('email/spam/', 'email/ham/')

if __name__ == '__main__':
    main()
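To make the difference between the two document models concrete, here is a small usage sketch (assuming the functions above are in scope; the vocabulary and document are made up for illustration):

vocab = ['buy', 'now', 'hello', 'friend']
doc = ['buy', 'now', 'buy']

print(setOfWords2Vec(vocab, doc))    # [1, 1, 0, 0] -- presence/absence only
print(bagOfWords2VecMN(vocab, doc))  # [2, 1, 0, 0] -- word counts

With the email/spam/ and email/ham/ folders in place (25 numbered .txt files each), calling spamTest('email/spam/', 'email/ham/') trains on the remaining 40 messages and reports the error rate on the 10 held-out ones; expect the number to fluctuate between runs, since the hold-out set is random.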
Inferring regional tendencies from personal ads
This example pulls posts from two regional boards of a website and checks whether their word usage shows any regular differences.
Dataset
The data is fetched over RSS using Python's feedparser package (see here for an introduction). Posts are collected from two regional boards of the same site.
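For readers unfamiliar with feedparser, a minimal sketch of reading a feed follows; feedparser.parse and the entries/summary fields are the same ones the code below relies on, though the craigslist feed may no longer be served at this URL.

import feedparser

feed = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
print(len(feed['entries']))           # number of posts in the feed
print(feed['entries'][0]['summary'])  # the text of the first post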
Test method
Cross-validation (random hold-out, as above).
Implementation notes
1. Two kinds of words need special handling (the two groups overlap heavily): the highest-frequency words, and the so-called stop words (roughly, words that occur very often but carry little real meaning). Stop-word lists for many languages can be found here. These words have to be removed so that the results actually reflect regional differences.
2. The getTopWords function merely reports summary statistics over the estimated probabilities; it is not essential for learning the Bayes classifier (see the note on its threshold below).
3. Apart from the data source, the implementation details closely mirror the spam-filter code above.
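A note on the -6.0 cutoff used in getTopWords below: the vectors returned by trainNB0 hold log probabilities, so the threshold corresponds to a conditional word probability of roughly e^-6, about 0.25%:

from math import exp
print(exp(-6.0))  # ~0.00248 -- only words at least this likely within a class are reported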
#coding=utf-8
from numpy import *

# Parse a document into a list of lowercase tokens,
# dropping anything shorter than three characters
def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W+', bigString)  # split on runs of non-word characters
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]


# Build the vocabulary: a list of every distinct word in the data set
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)

# Set-of-words model: mark each vocabulary word as present (1) or absent (0)
def setOfWords2Vec(vocabList, inputSet):
    retVocabList = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            retVocabList[vocabList.index(word)] = 1
        else:
            print('word', word, 'not in dict')
    return retVocabList

# The other model (bag of words): count occurrences of each vocabulary word
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

def trainNB0(trainMatrix, trainCategory):
    numTrainDoc = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDoc)
    # Initialize counts to 1 and denominators to 2 (Laplace smoothing)
    # so that no single factor in the product of probabilities is zero
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDoc):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Take logs for numerical precision; otherwise the product of many
    # small probabilities is very likely to underflow to zero
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)  # element-wise mult
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

# Read the stop-word list, one word per line in stopwords.txt
def stopWords():
    stopW = []
    for eachLine in open('stopwords.txt').readlines():
        stopW.append(eachLine.strip())
    return stopW

# Return the 30 most frequent words in the full text as (word, count) pairs
def calcMostFreq(vocabList, fullText):
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]

def localWords(rss1, rss0):
    import feedparser
    feed1 = feedparser.parse(rss1)
    feed0 = feedparser.parse(rss0)
    docList = []; classList = []; fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)  # NY is class 1
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)  # create vocabulary
    top30Words = calcMostFreq(vocabList, fullText)  # remove the 30 most frequent words
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    # Remove stop words too; stopWords() returns plain strings,
    # not (word, count) pairs, so no [0] indexing here
    for stopWord in stopWords():
        if stopWord in vocabList:
            vocabList.remove(stopWord)
    # Hold out 20 posts for testing (assumes each feed has at least ~10 entries)
    trainingSet = list(range(2 * minLen)); testSet = []
    for i in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:  # train the classifier (get probs) with trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the remaining items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V

def getTopWords(ny, sf):
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []; topSF = []
    # Keep only words whose log conditional probability exceeds -6.0
    for i in range(len(p0V)):
        if p0V[i] > -6.0: topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > -6.0: topNY.append((vocabList[i], p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in sortedNY:
        print(item[0])

def main():
    #print(stopWords())
    localWords('http://newyork.craigslist.org/stp/index.rss','http://sfbay.craigslist.org/stp/index.rss')

if __name__ == '__main__':
    main()
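To see the most indicative words per region rather than just the error rate, one can call getTopWords instead of localWords in main(), with the same two RSS URLs. A sketch, assuming the feeds are still being served:

getTopWords('http://newyork.craigslist.org/stp/index.rss',
            'http://sfbay.craigslist.org/stp/index.rss')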