寫在之前
本書涉及的源程序和數(shù)據(jù)都可以在以下網(wǎng)站中找到:http://guidetodatamining.com/
這本書理論比較簡單,書中錯誤較少,動手鍛煉較多,如果每個代碼都自己寫出來,收獲不少。總結(jié):適合入門。
歡迎轉(zhuǎn)載,轉(zhuǎn)載請注明出處,如有問題歡迎指正。
合集地址:https://www.zybuluo.com/hainingwyx/note/559139
概率及樸素貝葉斯
特點:分類并給出概率。
先驗概率:P(h)
后驗概率/條件概率:P(h/d)
# Training
class Classifier:
    def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
        """Build a classifier from the files named bucketPrefix-01 .. -10,
        excluding the bucket numbered testBucketNumber.  dataFormat is a
        tab-separated string describing how to interpret each column of
        the data files, e.g. for the iHealth data:
        "attr attr attr attr class" (tab-separated tokens).
        """
        total = 0       # total number of training instances seen
        classes = {}    # per-class instance counts
        counts = {}     # counts[class][column][value] -> occurrences
        self.format = dataFormat.strip().split('\t')
        self.prior = {}        # prior probabilities P(h)
        self.conditional = {}  # conditional probabilities P(D|h)
        # read every bucket numbered 1 through 10 except the held-out one
        for bucketNumber in range(1, 11):
            if bucketNumber == testBucketNumber:
                continue
            filename = "%s-%02i" % (bucketPrefix, bucketNumber)
            with open(filename) as f:
                lines = f.readlines()
            for line in lines:
                fields = line.strip().split('\t')
                ignore = []
                vector = []
                category = None  # guard against lines with no 'class' column
                for i, field in enumerate(fields):
                    if self.format[i] == 'num':
                        vector.append(float(field))
                    elif self.format[i] == 'attr':
                        vector.append(field)
                    elif self.format[i] == 'comment':
                        ignore.append(field)
                    elif self.format[i] == 'class':
                        category = field
                # now process this instance
                total += 1
                classes.setdefault(category, 0)
                counts.setdefault(category, {})
                classes[category] += 1
                # count each attribute value, keyed by 1-based column number
                for col, columnValue in enumerate(vector, start=1):
                    counts[category].setdefault(col, {})
                    counts[category][col].setdefault(columnValue, 0)
                    counts[category][col][columnValue] += 1
        # done counting -- compute prior probabilities P(h)
        for (category, count) in classes.items():
            self.prior[category] = count / total
        # conditional probabilities P(D|h)
        for (category, columns) in counts.items():
            self.conditional.setdefault(category, {})
            for (col, valueCounts) in columns.items():
                self.conditional[category].setdefault(col, {})
                for (attrValue, count) in valueCounts.items():
                    self.conditional[category][col][attrValue] = (
                        count / classes[category])
        self.tmp = counts  # raw counts kept for inspection/debugging

    # Classification
    def classify(self, itemVector):
        """Return the category we think itemVector belongs to."""
        results = []
        for (category, prior) in self.prior.items():
            prob = prior
            col = 1
            for attrValue in itemVector:
                if attrValue not in self.conditional[category][col]:
                    # this attribute value never occurred with this
                    # category in training, so the estimate is 0
                    prob = 0
                else:
                    prob = prob * self.conditional[category][col][attrValue]
                col += 1
            results.append((prob, category))
        # highest probability wins (ties broken by category name, as before)
        return(max(results)[1])
# Demo / test code: requires the iHealth data buckets on disk.
# Guarded so that importing this module does not trigger file I/O.
if __name__ == "__main__":
    c = Classifier("iHealth/i", 10, "attr\tattr\tattr\tattr\tclass")
    print(c.classify(['health', 'moderate', 'moderate', 'yes']))
問題:當存在某個概率為0時,直接主導整個貝葉斯的計算過程,即使其他的獨立事件的條件概率接近于1。此外,基于樣本集估計出來的概率往往是真實概率的偏低估計。
改進:將 $P(x|y)=\dfrac{n_c}{n}$
修改為 $P(x|y)=\dfrac{n_c + m p}{n + m}$
其中 $n$ 是 y 事件總數(shù),
$n_c$ 是 y 中 x 事件總數(shù),$m$ 是等效樣本容量,通常的確定方法是:m 為可選屬性的個數(shù),p 是可選屬性概率的先驗估計,通常假設為均勻分布。
當處理的數(shù)據(jù)是連續(xù)的時候,有兩種解決辦法。一是離散化,構(gòu)建類別;一是假設概率分布服從高斯分布,然后計算概率。
樣本標準差:$s=\sqrt{\dfrac{\sum_i (x_i-\bar{x})^2}{n-1}}$
對于樣本集而言,樣本標準差(除以 n-1)的計算公式是總體標準差的更優(yōu)估計。
# pdf implementation
import math


def pdf(mean, ssd, x):
    """Gaussian probability density function: computes P(x | y).

    mean and ssd are the mean and sample standard deviation of the
    attribute over all the items in class y; x is the observed value.
    """
    # exp(-(x - mean)^2 / (2 * ssd^2)) -- the Gaussian exponent term
    ePart = math.exp(-(x - mean) ** 2 / (2 * ssd ** 2))
    return (1.0 / (math.sqrt(2 * math.pi) * ssd)) * ePart
# Training with continuous (numeric) attributes
import math


class Classifier:
    def __init__(self, bucketPrefix, testBucketNumber, dataFormat):
        """Build a classifier from the files named bucketPrefix-01 .. -10,
        excluding the bucket numbered testBucketNumber.  dataFormat is a
        tab-separated string describing how to interpret each column of
        the data files, e.g. "attr attr num num class".
        'attr' columns are handled with counted conditional probabilities;
        'num' columns are modelled with a Gaussian (per-class mean and
        sample standard deviation).
        """
        total = 0       # total number of training instances seen
        classes = {}    # per-class instance counts
        counts = {}     # counts for non-numeric attributes
        # running sums for numeric attributes, used to compute the means
        totals = {}
        # raw numeric values per class/column, used for the sample
        # standard deviation
        numericValues = {}
        self.format = dataFormat.strip().split('\t')
        self.prior = {}
        self.conditional = {}
        # read every bucket numbered 1 through 10 except the held-out one
        for bucketNumber in range(1, 11):
            if bucketNumber == testBucketNumber:
                continue
            filename = "%s-%02i" % (bucketPrefix, bucketNumber)
            with open(filename) as f:
                lines = f.readlines()
            for line in lines:
                fields = line.strip().split('\t')
                ignore = []
                vector = []
                nums = []
                category = None  # guard against lines with no 'class' column
                for i, field in enumerate(fields):
                    if self.format[i] == 'num':
                        nums.append(float(field))
                    elif self.format[i] == 'attr':
                        vector.append(field)
                    elif self.format[i] == 'comment':
                        ignore.append(field)
                    elif self.format[i] == 'class':
                        category = field
                # now process this instance
                total += 1
                classes.setdefault(category, 0)
                counts.setdefault(category, {})
                totals.setdefault(category, {})
                numericValues.setdefault(category, {})
                classes[category] += 1
                # non-numeric attributes, keyed by 1-based column number
                for col, columnValue in enumerate(vector, start=1):
                    counts[category].setdefault(col, {})
                    counts[category][col].setdefault(columnValue, 0)
                    counts[category][col][columnValue] += 1
                # numeric attributes: accumulate sum and keep raw values
                for col, columnValue in enumerate(nums, start=1):
                    totals[category].setdefault(col, 0)
                    totals[category][col] += columnValue
                    numericValues[category].setdefault(col, [])
                    numericValues[category][col].append(columnValue)
        # done counting -- compute prior probabilities P(h)
        for (category, count) in classes.items():
            self.prior[category] = count / total
        # conditional probabilities P(D|h) for non-numeric attributes
        for (category, columns) in counts.items():
            self.conditional.setdefault(category, {})
            for (col, valueCounts) in columns.items():
                self.conditional[category].setdefault(col, {})
                for (attrValue, count) in valueCounts.items():
                    self.conditional[category][col][attrValue] = (
                        count / classes[category])
        self.tmp = counts  # raw counts kept for inspection/debugging
        # mean and sample standard deviation for each numeric column
        self.means = {}
        self.totals = totals
        for (category, columns) in totals.items():
            self.means.setdefault(category, {})
            for (col, cTotal) in columns.items():
                self.means[category][col] = cTotal / classes[category]
        self.ssd = {}
        for (category, columns) in numericValues.items():
            self.ssd.setdefault(category, {})
            for (col, values) in columns.items():
                theMean = self.means[category][col]
                sumOfSquareDifferences = sum(
                    (value - theMean) ** 2 for value in values)
                # divide by n - 1: the sample standard deviation is a
                # better estimate of the population value
                self.ssd[category][col] = math.sqrt(
                    sumOfSquareDifferences / (classes[category] - 1))

    # Classification with continuous attributes
    def classify(self, itemVector, numVector):
        """Return the most probable category for the given discrete
        attribute vector (itemVector) and numeric vector (numVector)."""
        results = []
        sqrt2pi = math.sqrt(2 * math.pi)
        for (category, prior) in self.prior.items():
            prob = prior
            col = 1
            for attrValue in itemVector:
                if attrValue not in self.conditional[category][col]:
                    # this attribute value never occurred with this
                    # category in training, so the estimate is 0
                    prob = 0
                else:
                    prob = prob * self.conditional[category][col][attrValue]
                col += 1
            col = 1
            for x in numVector:
                # Gaussian pdf for P(x | category)
                mean = self.means[category][col]
                ssd = self.ssd[category][col]
                ePart = math.exp(-(x - mean) ** 2 / (2 * ssd ** 2))
                prob = prob * ((1.0 / (sqrt2pi * ssd)) * ePart)
                col += 1
            results.append((prob, category))
        # highest probability wins (ties broken by category name, as before)
        return(max(results)[1])
貝葉斯和kNN的比較
- 貝葉斯優(yōu)點:實現(xiàn)簡單,和其他方法相比需要的訓練數(shù)據(jù)更少
- 貝葉斯缺點:不能學習到特征之間的相互作用。
- kNN優(yōu)點:實現(xiàn)簡單,不用考慮數(shù)據(jù)特定的結(jié)構(gòu);訓練集很大的時候是一個合理的選擇。
- kNN缺點:需要大量的內(nèi)存來存儲訓練集。
許多真實數(shù)據(jù)挖掘問題中,很多屬性不是獨立的。有時候可以假設獨立。之所以稱樸素貝葉斯,是因為盡管知道屬性獨立的假設不成立,仍然假設屬性之間是獨立的。