Getting Started with Machine Learning in Python (Part 1)

# -*- coding: utf-8 -*-

# ++++++++++++++++++++++++++++++++++++++++++++++++++++
# machine_classfied_ldmwp.py
# @Summary: classification with scikit-learn estimators
# @Author: Glen
# @Date: 2016.8.13
# @Source: Python數(shù)據(jù)挖掘入門與實踐 (Learning Data Mining with Python)
# +++++++++++++++++++++++++++++++++++++++++++++++++++++

# --------------------------------------------------
# scikit-learn is a Python library that implements a wide range of
# data mining algorithms and provides a uniform programming interface,
# standardized testing and parameter-tuning tools.
# Basic concepts:
# - Estimator: used for classification, clustering and regression
# - Transformer: used for data preprocessing and data transformation
# - Pipeline: chains the steps of a data mining workflow so it can be reused
# --------------------------------------------------

# ------------------------------------------
# scikit-learn estimators
# Estimators are used for classification tasks and expose two key methods:
# - fit(): trains the algorithm and sets its internal parameters
# - predict(): takes the test set and returns the predicted labels
# ------------------------------------------

# ----------------------------------------
# Nearest-neighbour algorithms
# The key ingredient is how distance is measured.
# Common metrics include Euclidean, Manhattan and cosine distance;
# the choice of metric can have a significant effect on the results.
# -----------------------------------------
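# A standalone illustration (not in the original script) of how the distance
# metric enters the scikit-learn API via the `metric` parameter of
# KNeighborsClassifier; the toy data below is made up purely for demonstration.
from sklearn.neighbors import KNeighborsClassifier

X_demo = [[0, 0], [1, 1], [8, 8], [9, 9]]
y_demo = [0, 0, 1, 1]
for demo_metric in ('euclidean', 'manhattan'):
    knn_demo = KNeighborsClassifier(n_neighbors=1, metric=demo_metric)
    knn_demo.fit(X_demo, y_demo)
    print(demo_metric, '->', knn_demo.predict([[2, 2]]))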

import sys
import pickle
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
# train_test_split, cross_val_score and GridSearchCV live in sklearn.model_selection
# in current scikit-learn (formerly sklearn.cross_validation / sklearn.grid_search)
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict
from operator import itemgetter

# Load the pickled datasets
with open(r'E:\github\workrobot\workrobot\data\ceic\random_datasets.pkl', 'rb') as f:
    datasets = pickle.load(f)

dataset_choosed = datasets[0]

# Map the Chinese column names to generic English names (v_0, v_1, ...)
columns_chinese = dataset_choosed.columns
columns_english = ['_'.join(['v',str(item)]) for item in range(len(columns_chinese))]
columns_mapping = dict(zip(columns_english, columns_chinese))

# Assign the new column names to the data frame
dataset_choosed.columns = columns_english

# Target: whether v_0 is above its mean; features: the remaining columns
y = dataset_choosed['v_0'].gt(dataset_choosed['v_0'].mean())
x = dataset_choosed.iloc[:,range(1,len(columns_english))]

# Split the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=14)

# K-nearest-neighbours classifier
estimator = KNeighborsClassifier()
# Train the estimator
estimator.fit(x_train, y_train)

# Predict on the test set
y_predicted = estimator.predict(x_test)

# Report the accuracy
accuracy = np.mean(y_test == y_predicted) * 100
print('The accuracy is {0:.1f}%.'.format(accuracy))

# Cross-validation
scores = cross_val_score(estimator, x, y, scoring='accuracy')
average_accuracy = np.mean(scores) * 100
print('The average accuracy is {0:.1f}%.'.format(average_accuracy))

# Parameter tuning: sweep over n_neighbors
avg_scores = []
all_scores = []
parameter_values = list(range(1,21))
for n_neighbors in parameter_values:
    estimator = KNeighborsClassifier(n_neighbors=n_neighbors)
    scores = cross_val_score(estimator, x, y, scoring='accuracy')
    avg_scores.append(np.mean(scores))
    all_scores.append(scores)
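# A minimal sketch (not in the original script) of how the sweep above could
# be visualised; it assumes matplotlib is installed.
import matplotlib.pyplot as plt

plt.plot(parameter_values, avg_scores, '-o')
plt.xlabel('n_neighbors')
plt.ylabel('Mean cross-validated accuracy')
plt.show()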

# Preprocessing: rescale each feature to the [0, 1] range
x_transformed = MinMaxScaler().fit_transform(x)
estimator = KNeighborsClassifier()
transformed_scores = cross_val_score(estimator, x_transformed, y, scoring='accuracy')
print('The average accuracy is {0:.1f}%.'.format(np.mean(transformed_scores) * 100))

# Pipeline: scaling followed by a KNN classifier
scaling_pipeline = Pipeline([('scale', MinMaxScaler()), ('predict', KNeighborsClassifier())])
scores = cross_val_score(scaling_pipeline, x, y, scoring='accuracy')
print('The pipeline scored an average accuracy of {0:.1f}%.'.format(np.mean(scores) * 100))

# ---------------------------------------
# Classification: decision trees
# @Summary: a supervised machine-learning algorithm
# @Method: scikit-learn implements classification and regression trees (CART)
# ----------------------------------------

print('-'*50)
print('CART')

# Create the classifier
clf = DecisionTreeClassifier(random_state=14)
transformed_scores = cross_val_score(clf, x_transformed, y, scoring='accuracy')
print('The average accuracy is {0:.1f}%.'.format(np.mean(transformed_scores) * 100))

# Random forest
clf = RandomForestClassifier(random_state=14)
transformed_scores = cross_val_score(clf, x_transformed, y, scoring='accuracy')
print('The average accuracy is {0:.1f}%.'.format(np.mean(transformed_scores) * 100))

'''
# Search for the best parameters
parameter_space = {'max_features':[2, 40, 'auto'],
                   'n_estimators': [100, ],
                   'criterion': ['gini', 'entropy'],
                   'min_samples_leaf': [2, 4, 6]}
clf = RandomForestClassifier(random_state=14)
grid = GridSearchCV(clf, parameter_space)
grid.fit(x, y)
print('Accuracy: {0:.1f}%.'.format(grid.best_score_ * 100))
print(grid.best_estimator_)'''
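
# A runnable sketch (not in the original script) of the grid search above,
# using GridSearchCV from sklearn.model_selection; the reduced parameter grid
# below is an illustrative choice, not the original one.
demo_parameter_space = {'n_estimators': [10, 100],
                        'criterion': ['gini', 'entropy'],
                        'min_samples_leaf': [2, 4, 6]}
demo_clf = RandomForestClassifier(random_state=14)
demo_grid = GridSearchCV(demo_clf, demo_parameter_space)
demo_grid.fit(x, y)
print('Best accuracy: {0:.1f}%.'.format(demo_grid.best_score_ * 100))
print(demo_grid.best_params_)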

# --------------------------
# Affinity analysis: movie recommendations
# @Algorithm: the Apriori algorithm
# --------------------------

print('\n---------------Apriori-------------')

# Load the MovieLens ratings data
ratings_filename = r'E:\data\bigdata\movies\u.data'
all_ratings = pd.read_csv(ratings_filename, delimiter='\t', header=None,
                          names = ['UserID', 'MovieID', 'Rating', 'Datetime'])
all_ratings['Datetime'] = pd.to_datetime(all_ratings['Datetime'], unit='s')
print(all_ratings[:5])

all_ratings['Favorable'] = all_ratings['Rating'] > 3
print(all_ratings[10:15])

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Sample rows of all_ratings:
#     UserID  MovieID  Rating            Datetime Favorable
# 10      62      257       2 1997-11-12 22:07:14     False
# 11     286     1014       5 1997-11-17 15:38:45      True
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Use the ratings from the first 200 users as the training set
ratings = all_ratings[all_ratings['UserID'].isin(range(200))]

# Keep only the rows where the user liked the movie
favorable_ratings = ratings[ratings["Favorable"]]

# Dictionary mapping each user id to the frozenset of movie ids that user liked
favorable_reviews_by_users = dict((k, frozenset(v.values)) for k, v in favorable_ratings.groupby("UserID")["MovieID"])
for k, v in favorable_ratings.groupby("UserID")['MovieID']:
    print(k,' --> ',list(v))

# Number of favourable reviews per movie
num_favorable_by_movie = ratings[["MovieID", "Favorable"]].groupby("MovieID").sum()
print(num_favorable_by_movie.sort_values("Favorable", ascending=False)[:5])

print('----------------------Start---------------------')

def find_frequent_itemsets(favorable_reviews_by_users, k_1_itemsets, min_support):
    # Extend each frequent itemset of length k-1 with one more movie liked by
    # the same user, count how often each candidate superset occurs, and keep
    # only the candidates that meet the minimum support.
    counts = defaultdict(int)
    for user, reviews in favorable_reviews_by_users.items():
        for itemset in k_1_itemsets:
            if itemset.issubset(reviews):
                for other_reviewed_movie in reviews - itemset:
                    current_superset = itemset | frozenset((other_reviewed_movie,))
                    counts[current_superset] += 1
    return dict([(itemset, frequency) for itemset, frequency in counts.items() if frequency >= min_support])


# Frequent itemsets, keyed by itemset length
frequent_itemsets = {}
# Minimum support threshold
min_support = 50

# Generate the initial (length-1) frequent itemsets
frequent_itemsets[1] = dict((frozenset((movie_id,)), row["Favorable"])
                                for movie_id, row in num_favorable_by_movie.iterrows()
                                if row["Favorable"] > min_support)
print(frequent_itemsets)

print("There are {} movies with more than {} favorable reviews".format(len(frequent_itemsets[1]), min_support))
sys.stdout.flush()

# Iteratively generate longer frequent itemsets
for k in range(2, 20):
    # Generate candidates of length k, using the frequent itemsets of length k-1
    # Only store the frequent itemsets
    cur_frequent_itemsets = find_frequent_itemsets(favorable_reviews_by_users, frequent_itemsets[k-1],
                                                   min_support)
    if len(cur_frequent_itemsets) == 0:
        print("Did not find any frequent itemsets of length {}".format(k))
        sys.stdout.flush()
        break
    else:
        print("I found {} frequent itemsets of length {}".format(len(cur_frequent_itemsets), k))
        #print(cur_frequent_itemsets)
        sys.stdout.flush()
        frequent_itemsets[k] = cur_frequent_itemsets
# We aren't interested in the itemsets of length 1, so remove those
del frequent_itemsets[1]

# Frequent itemset generation is done; now derive some statistics from them.
# Create the association rules: each rule is only a candidate until its confidence has been tested.
candidate_rules = []
for itemset_length, itemset_counts in frequent_itemsets.items():
    for itemset in itemset_counts.keys():
        for conclusion in itemset:
            premise = itemset - set((conclusion,))
            candidate_rules.append((premise, conclusion))
print("There are {} candidate rules".format(len(candidate_rules)))

# candidate_rules is a list of (premise, conclusion) tuples
print(candidate_rules[:5])

# Compute the confidence of each rule
# (very similar to the affinity analysis in chapter 1 of the book)
correct_counts = defaultdict(int)
incorrect_counts = defaultdict(int)
for user, reviews in favorable_reviews_by_users.items():
    for candidate_rule in candidate_rules:
        premise, conclusion = candidate_rule
        if premise.issubset(reviews):
            if conclusion in reviews:
                correct_counts[candidate_rule] += 1
            else:
                incorrect_counts[candidate_rule] += 1
rule_confidence = {candidate_rule: correct_counts[candidate_rule] / float(correct_counts[candidate_rule] + incorrect_counts[candidate_rule])
              for candidate_rule in candidate_rules}

# Sort the rules by confidence
sorted_confidence = sorted(rule_confidence.items(), key=itemgetter(1), reverse=True)
for index in range(5):
    print("Rule #{0}".format(index + 1))
    (premise, conclusion) = sorted_confidence[index][0]
    print("Rule: If a person recommends {0} they will also recommend {1}".format(premise, conclusion))
    print(" - Confidence: {0:.3f}".format(rule_confidence[(premise, conclusion)]))
    print("")

# ---------------------------------------
# Extracting features with transformers
# ----------------------------------------

# A model is a simplification of the world, and feature extraction is no different.
# Reducing complexity has benefits, but it also has drawbacks: simplification discards a lot of detail.

# This example uses the Adult dataset to predict whether a person earns more than
# $50,000 a year (a small transformer sketch follows once the data is loaded).

adult_filename = r'E:\data\bigdata\adult\adult.data'
adult = pd.read_csv(adult_filename, header=None, names=["Age", "Work-Class", "fnlwgt", "Education",
                                                        "Education-Num", "Marital-Status", "Occupation",
                                                        "Relationship", "Race", "Sex", "Capital-gain",
                                                        "Capital-loss", "Hours-per-week", "Native-Country",
                                                        "Earnings-Raw"])
print(adult.head())
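
# A minimal sketch (not in the original script) of the transformer idea on this
# dataset: the derived LongHours column and the VarianceThreshold usage below
# are illustrative additions, not taken from the original.
from sklearn.feature_selection import VarianceThreshold

# Derive a simple binary feature from a continuous one
adult['LongHours'] = adult['Hours-per-week'] > 40

# Transformers expose fit/transform, just as estimators expose fit/predict;
# VarianceThreshold drops features whose variance is below its threshold.
x_numeric = adult[['Age', 'Education-Num', 'Capital-gain', 'Capital-loss', 'Hours-per-week']].values
vt = VarianceThreshold()
x_reduced = vt.fit_transform(x_numeric)
print(x_reduced.shape)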