import pandas;
from pandas import read_csv;
# Load the training set and discard every row that contains a missing
# value, so the encoders and the model below only see complete records.
data = read_csv('D:\\PDM\\4.4\\data.csv', encoding='utf8').dropna()

# Notebook-style inspection: (rows, columns) left after the drop.
data.shape
# Nominal (unordered) categorical columns; these are the columns that get
# converted to dummy variables.
dummyColumns = [
    'Gender',
    'Home Ownership',
    'Internet Connection',
    'Marital Status',
    'Movie Selector',
    'Prerec Format',
    'TV Signal',
]
# Cast each nominal column to the pandas ``category`` dtype before encoding.
for column in dummyColumns:
    data[column] = data[column].astype('category')

# Build the dummy-variable frame. New columns are named "<column> <level>"
# (space separator) and the first level of every column is dropped.
dummiesData = pandas.get_dummies(
    data, columns=dummyColumns, prefix=dummyColumns,
    prefix_sep=" ", drop_first=True,
)

# Notebook-style inspection of the Gender levels and the encoded columns.
data.Gender.unique()
dummiesData.columns

# Education level labels, listed from highest to lowest:
#   Post-Doc, Doctorate, Master's Degree, Bachelor's Degree,
#   Associate's Degree, Some College, Trade School, High School,
#   Grade School
# Ordinal ranks for the education labels: a larger number means a higher
# level of education (Grade School = 1 up to Post-Doc = 9).
educationLevelDict = {
    "Post-Doc": 9,
    "Doctorate": 8,
    "Master's Degree": 7,
    "Bachelor's Degree": 6,
    "Associate's Degree": 5,
    "Some College": 4,
    "Trade School": 3,
    "High School": 2,
    "Grade School": 1,
}
# Replace each education label with its ordinal rank; Series.map leaves NaN
# for any label that is not a key of the dictionary.
dummiesData['Education Level Map'] = dummiesData['Education Level'].map(
    educationLevelDict
)

# Ordinal scale shared by every frequency column (Never = 0 ... Daily = 4).
# The original note compares this tiering to the grouping used in RFM analysis.
freqMap = {'Never': 0, 'Rarely': 1, 'Monthly': 2, 'Weekly': 3, 'Daily': 4}

# Add a "<column> Map" column holding the ordinal value of each frequency
# column, in the same order as the source columns are listed here.
for _freqColumn in (
    'PPV Freq',
    'Theater Freq',
    'TV Movie Freq',
    'Prerec Buying Freq',
    'Prerec Renting Freq',
    'Prerec Viewing Freq',
):
    dummiesData[_freqColumn + ' Map'] = dummiesData[_freqColumn].map(freqMap)

# Notebook-style inspection of the columns after the ordinal encoding.
dummiesData.columns
# Independent variables fed to the model, grouped by how they are produced.
dummiesSelect = (
    # Numeric columns used as-is.
    [
        'Age', 'Num Bathrooms', 'Num Bedrooms',
        'Num Cars', 'Num Children', 'Num TVs',
    ]
    # Ordinal columns created with Series.map.
    + [
        'Education Level Map',
        'PPV Freq Map',
        'Theater Freq Map',
        'TV Movie Freq Map',
        'Prerec Buying Freq Map',
        'Prerec Renting Freq Map',
        'Prerec Viewing Freq Map',
    ]
    # Dummy columns created by get_dummies, named "<column> <level>".
    + [
        'Gender Male',
        'Internet Connection DSL',
        'Internet Connection Dial-Up',
        'Internet Connection IDSN',
        'Internet Connection No Internet Connection',
        'Internet Connection Other',
        'Marital Status Married',
        'Marital Status Never Married',
        'Marital Status Other',
        'Marital Status Separated',
        'Movie Selector Me',
        'Movie Selector Other',
        'Movie Selector Spouse/Partner',
        'Prerec Format DVD',
        'Prerec Format Laserdisk',
        'Prerec Format Other',
        'Prerec Format VHS',
        'Prerec Format Video CD',
        'TV Signal Analog antennae',
        'TV Signal Cable',
        'TV Signal Digital Satellite',
        "TV Signal Don't watch TV",
    ]
)
# Independent variables: the selected predictor columns.
inputData = dummiesData[dummiesSelect]
# Dependent variable, kept as a one-column DataFrame: the dummy column for
# 'Home Ownership' == 'Rent'.
outputData = dummiesData[['Home Ownership Rent']]

from sklearn import linear_model

lrModel = linear_model.LogisticRegression()
# scikit-learn expects y with shape (n_samples,). Passing the one-column
# DataFrame directly works but raises a DataConversionWarning, so the target
# is flattened to 1-D for both fit and score.
lrModel.fit(inputData, outputData.values.ravel())
# Mean accuracy of the fitted model on the training data itself.
lrModel.score(inputData, outputData.values.ravel())
# Load the new records that are to be scored by the fitted model.
newData = read_csv('D:\\PDM\\4.4\\newData.csv', encoding='utf8')

# Give every nominal column exactly the category levels of the training data,
# so get_dummies below yields the same dummy columns the model was fitted on.
# ``astype('category', categories=...)`` is no longer accepted by pandas; the
# categories have to be supplied through a CategoricalDtype instead.
for column in dummyColumns:
    newData[column] = newData[column].astype(
        pandas.api.types.CategoricalDtype(
            categories=data[column].cat.categories
        )
    )

# Values outside the training categories became NaN in the cast above; drop
# those rows together with any other incomplete row.
newData = newData.dropna()

# Ordinal encodings, using the same dictionaries as for the training data.
newData['Education Level Map'] = newData['Education Level'].map(educationLevelDict)
for freqColumn in (
    'PPV Freq',
    'Theater Freq',
    'TV Movie Freq',
    'Prerec Buying Freq',
    'Prerec Renting Freq',
    'Prerec Viewing Freq',
):
    newData[freqColumn + ' Map'] = newData[freqColumn].map(freqMap)

# Dummy-encode the nominal columns with the same options as the training data.
dummiesNewData = pandas.get_dummies(
    newData,
    columns=dummyColumns,
    prefix=dummyColumns,
    prefix_sep=" ",
    drop_first=True,
)

# Predictor matrix for the new records, in the training column order.
inputNewData = dummiesNewData[dummiesSelect]
# Output the predictions for the new records. The prepared matrix
# ``inputNewData`` must be passed here; passing the training matrix
# ``inputData`` would only re-predict the rows the model was fitted on.
lrModel.predict(inputNewData)
虛擬變量(dummy variables)
虛擬變量,也叫啞變量和離散特征編碼,可用來表示分類變量、非數量因素可能產生的影響。
① 離散特征的取值之間有大小的意義
例如:尺寸(L、XL、XXL)
離散特征的取值有大小意義的處理函數:map
pandas.Series.map(dict)
參數 dict:映射的字典
② 離散特征的取值之間沒有大小的意義
pandas.get_dummies
例如:顏色(Red,Blue,Green)
處理函數:
get_dummies(data,prefix=None,prefix_sep="_",dummy_na=False,columns=None,drop_first=False)
① data 要處理的DataFrame
② prefix 列名的前綴,在多個列有相同的離散項時候使用
③ prefix_sep 前綴和離散值的分隔符,默認為下劃線,默認即可
④ dummy_na 是否把NA值作為一個離散值進行處理,默認為不處理
⑤ columns 要處理的列名,如果不指定,那么默認處理所有 object 或 category 類型的列
⑥ drop_first 是否從備選項中刪除第一個,建模的時候為避免共線性使用