Examples of the Naive Bayes Algorithm


Applications of the Bayes classifier

Filtering spam email

The best-known application of the Bayes classifier is spam filtering. Readers who want a more detailed account are encouraged to read the relevant chapters of Hackers & Painters or The Beauty of Mathematics (数学之美); the basic implementation of the Bayes classifier is covered here.

Dataset

Two folders, one of normal (ham) emails and one of spam emails, each containing 25 emails.

Test method

Randomly select 10 of the 50 emails as test data.
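As a minimal sketch of such a random hold-out split (separate from the full script below, which does the same thing with numpy's random.uniform and del):

# Minimal sketch: pick 10 of the 50 email indices at random for testing.
import random

indices = list(range(50))               # indices of the 50 emails
test_set = random.sample(indices, 10)   # 10 random test indices
training_set = [i for i in indices if i not in test_set]
print(len(training_set))   # 40
print(len(test_set))       # 10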

Implementation details

1. First we need to convert the text into the vector form we want, which takes a bit of regular-expression work (a small self-contained sketch follows this list).

2. Because of the cross-validation approach, the random split means the results differ somewhat from run to run (a sketch of averaging over repeated runs follows the full code below).
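A minimal, self-contained sketch of the text-to-vector step from item 1 (the sample sentence and the tiny vocabulary are made up for illustration; the full code below builds the vocabulary from the actual emails):

import re

sample = 'Peter, do you want to play Flash games tonight?'
# split on non-word characters, lowercase, keep tokens longer than 2 characters
tokens = [tok.lower() for tok in re.split(r'\W+', sample) if len(tok) > 2]
print(tokens)   # ['peter', 'you', 'want', 'play', 'flash', 'games', 'tonight']

# set-of-words model: 1 if the vocabulary word appears in the document, else 0
vocab = ['flash', 'games', 'peter', 'viagra']
print([1 if w in tokens else 0 for w in vocab])   # [1, 1, 1, 0]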


#coding=utf-8
from numpy import *

# Parse a document into a list of lowercase tokens
def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

# Build a vocabulary list containing every word that appears in the data set
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)

# Set-of-words model: mark each vocabulary word as present (1) or absent (0)
def setOfWords2Vec(vocabList, inputSet):
    retVocabList = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            retVocabList[vocabList.index(word)] = 1
        else:
            print 'word ', word, 'not in dict'
    return retVocabList

# Alternative model (bag of words): count how many times each word occurs
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

def trainNB0(trainMatrix, trainCategory):
    numTrainDoc = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDoc)
    # Initialize counts to 1 and denominators to 2 so that no single
    # zero probability wipes out the whole product (Laplace smoothing)
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDoc):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Take logs for numerical precision; otherwise the product of many
    # small probabilities would underflow to zero
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)    # element-wise multiplication
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

def spamTest(spamFolder, hamFolder):
    docList = []
    classList = []
    fullText = []
    for i in range(1, 26):
        wordList = textParse(open(spamFolder + str(i) + '.txt').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open(hamFolder + str(i) + '.txt').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = range(50)
    testSet = []
    for i in range(10):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    print trainingSet
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docList[docIndex]))
        #trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:    # classify the remaining items
        #wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print "classification error", docList[docIndex]
    print 'the error rate is: ', float(errorCount) / len(testSet)
    #return vocabList, fullText

def main():
    spamTest('email/spam/', 'email/ham/')

if __name__ == '__main__':
    main()

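Because of the random split mentioned in item 2, a single run gives a fairly noisy error estimate. A common remedy is to repeat the split several times and average the error rates; a rough sketch, assuming spamTest above is modified to return float(errorCount)/len(testSet) instead of only printing it:

# Hypothetical sketch: assumes spamTest ends with
#     return float(errorCount) / len(testSet)
numRuns = 10
totalError = 0.0
for _ in range(numRuns):
    totalError += spamTest('email/spam/', 'email/ham/')
print('average error rate over %d runs: %f' % (numRuns, totalError / numRuns))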

Inferring regional tendencies from personal ads

This example pulls posts from two regional boards of a website and analyzes whether their word usage follows certain patterns.

Dataset

The data here is fetched via RSS using Python's feedparser package (see here for an introduction; a short sketch follows). Posts are collected from two regional boards of the same site.
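A minimal sketch of what feedparser returns, assuming the feed still serves entries (the URL is the New York feed already used in main() below; the 'summary' field is the text that the code passes to textParse):

import feedparser

feed = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
print(len(feed['entries']))           # number of posts in the feed
print(feed['entries'][0]['summary'])  # raw text of the first post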

Test method

Cross-validation.

Implementation details

1. Two kinds of words need special handling (in practice they overlap heavily): the most frequent words, and the so-called stop words (as I understand it, words that occur very often but carry little actual meaning; stop-word lists for various languages can be found here). We need to remove these words so that the results better reflect regional differences (a small sketch follows this list).

2. The getTopWords function simply summarizes the most characteristic words from the estimated probabilities; it is not essential code for learning Bayes.

3. Apart from the different data source, the implementation details are very similar to the example above.
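A minimal, self-contained sketch of the word-filtering idea from item 1 (the tiny word lists are made up; the real code below removes the 30 most frequent words via calcMostFreq plus the words in a stop-word file):

from collections import Counter

full_text = ['the', 'the', 'and', 'museum', 'the', 'beach', 'and', 'park']
vocab = ['the', 'and', 'museum', 'beach', 'park']
stop_words = ['the', 'and', 'for']

# drop the most frequent words (top 2 here, top 30 in the real code) and the stop words
top_words = [w for w, _ in Counter(full_text).most_common(2)]
vocab = [w for w in vocab if w not in top_words and w not in stop_words]
print(vocab)  # ['museum', 'beach', 'park']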


#coding=utf-8
from numpy import *

# Parse a document into a list of lowercase tokens
def textParse(bigString):
    import re
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]

# Build a vocabulary list containing every word that appears in the data set
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)

# Set-of-words model: mark each vocabulary word as present (1) or absent (0)
def setOfWords2Vec(vocabList, inputSet):
    retVocabList = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            retVocabList[vocabList.index(word)] = 1
        else:
            print 'word ', word, 'not in dict'
    return retVocabList

# Alternative model (bag of words): count how many times each word occurs
def bagOfWords2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec

def trainNB0(trainMatrix, trainCategory):
    numTrainDoc = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDoc)
    # Initialize counts to 1 and denominators to 2 so that no single
    # zero probability wipes out the whole product (Laplace smoothing)
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDoc):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Take logs for numerical precision; otherwise the product of many
    # small probabilities would underflow to zero
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)    # element-wise multiplication
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

# Read the stop-word list, one word per line
def stopWords():
    stopW = []
    f = open('stopwords.txt').readlines()
    for eachLine in f:
        stopW.append(eachLine[:-1])
    return stopW

# Return the 30 most frequent vocabulary words as (word, count) pairs
def calcMostFreq(vocabList, fullText):
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.iteritems(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]

def localWords(rss1, rss0):
    import feedparser
    feed1 = feedparser.parse(rss1)
    feed0 = feedparser.parse(rss0)
    docList = []; classList = []; fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)    # NY is class 1
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)    # create vocabulary
    top30Words = calcMostFreq(vocabList, fullText)    # remove the 30 most frequent words
    for pairW in top30Words:
        if pairW[0] in vocabList: vocabList.remove(pairW[0])
    stopW = stopWords()    # remove stop words as well
    for word in stopW:
        if word in vocabList:
            vocabList.remove(word)
    trainingSet = range(2 * minLen); testSet = []    # create test set
    for i in range(20):
        randIndex = int(random.uniform(0, len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []; trainClasses = []
    for docIndex in trainingSet:    # train the classifier (get probabilities) with trainNB0
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(array(trainMat), array(trainClasses))
    errorCount = 0
    for docIndex in testSet:    # classify the remaining items
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
    print 'the error rate is: ', float(errorCount) / len(testSet)
    return vocabList, p0V, p1V

def getTopWords(ny, sf):
    import operator
    vocabList, p0V, p1V = localWords(ny, sf)
    topNY = []; topSF = []
    # keep words whose log-probability is above a threshold in either class
    for i in range(len(p0V)):
        if p0V[i] > -6.0: topSF.append((vocabList[i], p0V[i]))
        if p1V[i] > -6.0: topNY.append((vocabList[i], p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print "SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**"
    for item in sortedSF:
        print item[0]
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print "NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**"
    for item in sortedNY:
        print item[0]

def main():
    #print stopWords()
    localWords('http://newyork.craigslist.org/stp/index.rss', 'http://sfbay.craigslist.org/stp/index.rss')

if __name__ == '__main__':
    main()

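To actually print the most indicative words per region (item 2 above), main() can call getTopWords instead of localWords; a usage sketch with the same two feeds:

getTopWords('http://newyork.craigslist.org/stp/index.rss',
            'http://sfbay.craigslist.org/stp/index.rss')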