Applications of Naive Bayes
Spam filtering
The best-known application of the naive Bayes classifier is spam filtering. For a detailed treatment, see the corresponding chapters of "Hackers and Painters" or "The Beauty of Mathematics" (《数学之美》); for the basic Bayes implementation, see here.
Dataset
Two folders, one of legitimate (ham) email and one of spam, each containing 25 messages.
Test method
Randomly pick 10 of the 50 messages as test data.
Implementation notes
1. First the raw text has to be turned into the vector form we need, which takes a little regular-expression work (see the short sketch after this list).
2. Because hold-out cross-validation picks the test set at random, the results differ slightly from run to run.
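As a quick illustration of note 1, here is a minimal sketch of what the textParse function below does: split on runs of non-word characters, lowercase, and drop very short tokens. The sample string is made up for illustration.

import re

sample = "Hi there, this is a TEST email -- buy now!!!"
tokens = re.split(r'\W+', sample)                    # split on non-word characters
tokens = [t.lower() for t in tokens if len(t) > 2]   # lowercase, drop short/empty tokens
print(tokens)  # ['there', 'this', 'test', 'email', 'buy', 'now']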
#coding=utf-8
from numpy import *

# Parse a document into a list of lowercase tokens,
# dropping anything shorter than three characters
def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W+', bigString)  # split on runs of non-word characters
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]


# Build the vocabulary: a list of every distinct word in the data set
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)

# Set-of-words model: mark each vocabulary word as present (1) or absent (0)
def setOfWords2Vec(vocabList, inputSet):
    retVocabList = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            retVocabList[vocabList.index(word)] = 1
        else:
            print('word', word, 'not in dict')
    return retVocabList

# The other model (bag of words): count occurrences of each vocabulary word
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

def trainNB0(trainMatrix, trainCategory):
    numTrainDoc = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDoc)
    # Initialize counts to 1 and denominators to 2 (Laplace smoothing)
    # so that no single factor in the product of probabilities is zero
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDoc):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Take logs for numerical precision; otherwise the product of many
    # small probabilities is very likely to underflow to zero
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)  # element-wise mult
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

def spamTest(spamFolder, hamFolder):
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        wordList = textParse(open(spamFolder + str(i) + '.txt').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open(hamFolder + str(i) + '.txt').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = list(range(50))
    testSet = []
    # Hold out 10 randomly chosen messages for testing
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        #trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the held-out messages
        #wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print('classification error', docList[docIndex])
    print('the error rate is: ', float(errorCount) / len(testSet))
    #return vocabList, fullText


def main():
    spamTest('email/spam/', 'email/ham/')

if __name__ == '__main__':
    main()
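To make the difference between the two document models concrete, here is a small usage sketch (assuming the functions above are in scope; the vocabulary and document are made up for illustration):

vocab = ['buy', 'now', 'hello', 'friend']
doc = ['buy', 'now', 'buy']

print(setOfWords2Vec(vocab, doc))    # [1, 1, 0, 0] -- presence/absence only
print(bagOfWords2VecMN(vocab, doc))  # [2, 1, 0, 0] -- word counts

With the email/spam/ and email/ham/ folders in place (25 numbered .txt files each), calling spamTest('email/spam/', 'email/ham/') trains on the remaining 40 messages and reports the error rate on the 10 held-out ones; expect the number to fluctuate between runs, since the hold-out set is random.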
Inferring regional tendencies from personal ads
This example pulls posts from two regional boards of a website and checks whether their word usage shows any regular differences.
Dataset
The data is fetched over RSS using Python's feedparser package (see here for an introduction). Posts are collected from two regional boards of the same site.
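For readers unfamiliar with feedparser, a minimal sketch of reading a feed follows; feedparser.parse and the entries/summary fields are the same ones the code below relies on, though the craigslist feed may no longer be served at this URL.

import feedparser

feed = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
print(len(feed['entries']))           # number of posts in the feed
print(feed['entries'][0]['summary'])  # the text of the first post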
Test method
Cross-validation (random hold-out, as above).
Implementation notes
1. Two kinds of words need special handling (the two groups overlap heavily): the highest-frequency words, and the so-called stop words (roughly, words that occur very often but carry little real meaning). Stop-word lists for many languages can be found here. These words have to be removed so that the results actually reflect regional differences.
2. The getTopWords function merely reports summary statistics over the estimated probabilities; it is not essential for learning the Bayes classifier (see the note on its threshold below).
3. Apart from the data source, the implementation details closely mirror the spam-filter code above.
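A note on the -6.0 cutoff used in getTopWords below: the vectors returned by trainNB0 hold log probabilities, so the threshold corresponds to a conditional word probability of roughly e^-6, about 0.25%:

from math import exp
print(exp(-6.0))  # ~0.00248 -- only words at least this likely within a class are reported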
#coding=utf-8
from numpy import *

# Parse a document into a list of lowercase tokens,
# dropping anything shorter than three characters
def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W+', bigString)  # split on runs of non-word characters
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]


# Build the vocabulary: a list of every distinct word in the data set
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)

# Set-of-words model: mark each vocabulary word as present (1) or absent (0)
def setOfWords2Vec(vocabList, inputSet):
    retVocabList = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            retVocabList[vocabList.index(word)] = 1
        else:
            print('word', word, 'not in dict')
    return retVocabList

# The other model (bag of words): count occurrences of each vocabulary word
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

def trainNB0(trainMatrix, trainCategory):
    numTrainDoc = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDoc)
    # Initialize counts to 1 and denominators to 2 (Laplace smoothing)
    # so that no single factor in the product of probabilities is zero
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDoc):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Take logs for numerical precision; otherwise the product of many
    # small probabilities is very likely to underflow to zero
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)  # element-wise mult
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

# Read the stop-word list, one word per line in stopwords.txt
def stopWords():
    stopW = []
    for eachLine in open('stopwords.txt').readlines():
        stopW.append(eachLine.strip())
    return stopW

# Return the 30 most frequent words in the full text as (word, count) pairs
def calcMostFreq(vocabList, fullText):
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]

def localWords(rss1, rss0):
    import feedparser
    feed1 = feedparser.parse(rss1)
    feed0 = feedparser.parse(rss0)
    docList = []; classList = []; fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)  # NY is class 1
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)  # create vocabulary
    top30Words = calcMostFreq(vocabList, fullText)  # remove the 30 most frequent words
    for pairW in top30Words:
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    # Remove stop words too; stopWords() returns plain strings,
    # not (word, count) pairs, so no [0] indexing here
    for stopWord in stopWords():
        if stopWord in vocabList:
            vocabList.remove(stopWord)
    # Hold out 20 posts for testing (assumes each feed has at least ~10 entries)
    trainingSet = list(range(2 * minLen)); testSet = []
    for i in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:  # train the classifier (get probs) with trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:  # classify the remaining items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is: ', float(errorCount) / len(testSet))
    return vocabList, p0V, p1V

def getTopWords(ny, sf):
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []; topSF = []
    # Keep only words whose log conditional probability exceeds -6.0
    for i in range(len(p0V)):
        if p0V[i] > -6.0: topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > -6.0: topNY.append((vocabList[i], p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in sortedNY:
        print(item[0])

def main():
    #print(stopWords())
    localWords('http://newyork.craigslist.org/stp/index.rss','http://sfbay.craigslist.org/stp/index.rss')

if __name__ == '__main__':
    main()
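To see the most indicative words per region rather than just the error rate, one can call getTopWords instead of localWords in main(), with the same two RSS URLs. A sketch, assuming the feeds are still being served:

getTopWords('http://newyork.craigslist.org/stp/index.rss',
            'http://sfbay.craigslist.org/stp/index.rss')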