python學習的第三天
1.三國TOP10人物分析
1.讀取小說內容
2.分詞
3.詞語過濾,刪除無關詞、重複分詞
4.排序
5.得出結論
import jieba
# 1. Read the full text of the novel into memory.
with open('./novel/threekingdom.txt', 'r', encoding='utf-8') as f:
    words = f.read()

counts = {}  # word -> number of occurrences, e.g. {'曹操': 234, '回寨': 56}

# 2. Segment the text into a list of words.
words_list = jieba.lcut(words)
for word in words_list:
    # Tokens of length 0 or 1 are not counted.
    if len(word) <= 1:
        continue
    # `counts[word] + 1` would raise KeyError the first time a word is seen;
    # dict.get(word, 0) supplies 0 for a missing key instead.
    counts[word] = counts.get(word, 0) + 1
print(counts)

# 3. Word filtering (irrelevant words, duplicate segmentations) is not done
#    in this first version of the script.

# 4. Sort the (word, count) pairs by count, highest first.
items = list(counts.items())
print('排序前的列表', items)


def sort_by_count(x):
    """Return the count element of a ``(word, count)`` pair (sort key)."""
    return x[1]


items.sort(key=sort_by_count, reverse=True)

# Slice rather than index over range(20): a text with fewer than 20 distinct
# words then prints what is available instead of raising IndexError.
for role, count in items[:20]:
    print(role, count)
排除不是人名的分詞,合并人名,然后排出top10
# Words that must not appear in the ranking: non-name words plus the alias
# spellings whose counts are folded into a canonical name below.
exclude = {"將軍", "卻說", "丞相", "二人", "不可", "荊州", "不能", "如此", "商議",
           "如何", "主公", "軍士", "軍馬", "左右", "次日", "引兵", "大喜", "天下",
           "東吳", "于是", "今日", "不敢", "魏兵", "陛下", "都督", "人馬", "不知",
           "孔明曰", "玄德曰", "劉備", "云長"}

# Merge the different spellings of the same person into one entry.
# dict.get(..., 0) keeps this from raising KeyError when a spelling never
# occurs in the segmented text.
counts['孔明'] = counts.get('孔明', 0) + counts.get('孔明曰', 0)
counts['玄德'] = (counts.get('玄德', 0) + counts.get('玄德曰', 0)
                + counts.get('劉備', 0))
counts['關公'] = counts.get('關公', 0) + counts.get('云長', 0)

# Drop the excluded words; pop(word, None) ignores words that are absent,
# whereas `del counts[word]` would raise KeyError.
for word in exclude:
    counts.pop(word, None)
最終代碼:(其中 collocations=False:不把相鄰兩個詞當作搭配(bigram)統計,避免同一個詞重複成對出現在詞云中)
import jieba
from wordcloud import WordCloud
import imageio
# 1. Read the full text of the novel into memory.
with open('./novel/threekingdom.txt', 'r', encoding='utf-8') as f:
    words = f.read()

counts = {}  # word -> number of occurrences, e.g. {'曹操': 234, '回寨': 56}

# Words that must not appear in the ranking: non-name words plus the alias
# spellings whose counts are folded into a canonical name in step 3.
exclude = {"將軍", "卻說", "丞相", "二人", "不可", "荊州", "不能", "如此", "商議",
           "如何", "主公", "軍士", "軍馬", "左右", "次日", "引兵", "大喜", "天下",
           "東吳", "于是", "今日", "不敢", "魏兵", "陛下", "都督", "人馬", "不知",
           "孔明曰", "玄德曰", "劉備", "云長"}

# 2. Segment the text into a list of words and count them.
words_list = jieba.lcut(words)
for word in words_list:
    # Tokens of length 0 or 1 are not counted.
    if len(word) <= 1:
        continue
    # `counts[word] + 1` would raise KeyError the first time a word is seen;
    # dict.get(word, 0) supplies 0 for a missing key instead.
    counts[word] = counts.get(word, 0) + 1
print(counts)

# 3. Merge different spellings of the same person, then drop excluded words.
#    get(..., 0) / pop(word, None) tolerate words that never occur in the
#    text, where plain indexing / `del` would raise KeyError.
counts['孔明'] = counts.get('孔明', 0) + counts.get('孔明曰', 0)
counts['玄德'] = (counts.get('玄德', 0) + counts.get('玄德曰', 0)
                + counts.get('劉備', 0))
counts['關公'] = counts.get('關公', 0) + counts.get('云長', 0)
for word in exclude:
    counts.pop(word, None)

# 4. Sort the (word, count) pairs by count, highest first.
items = list(counts.items())
print('排序前的列表', items)


def sort_by_count(x):
    """Return the count element of a ``(word, count)`` pair (sort key)."""
    return x[1]


items.sort(key=sort_by_count, reverse=True)

# Repeat every top-10 name `count` times so the word cloud sizes each name
# by its frequency, e.g. ['孔明', '孔明', ..., '曹操', ...].
# Slicing keeps this safe when fewer than 10 entries remain.
li = []
for role, count in items[:10]:
    print(role, count)
    li.extend([role] * count)

# 5. Render the result as a word cloud shaped by the mask image.
mask = imageio.imread('./china.jpg')
text = ' '.join(li)
WordCloud(
    font_path='msyh.ttc',
    background_color='white',
    width=800,
    height=600,
    mask=mask,
    # Do not treat two adjacent words as a collocation (bigram); the text is
    # the same name repeated, which would otherwise pair up with itself.
    collocations=False
).generate(text).to_file('top10.png')
2.匿名函數
# Anonymous functions (lambda).
# General form:
#     lambda x1, x2, ..., xn: expression
# Any number of parameters is allowed, but the body is a single expression.
sum_num = lambda x1, x2: x1 + x2
print(sum_num(2, 3))

# Sort (name, salary) tuples by salary, highest first.
name_info_list = [
    ('張三', 4500),
    ('李四', 9900),
    ('王五', 2000),
    ('趙六', 5500),
]
name_info_list.sort(reverse=True, key=lambda entry: entry[1])
print(name_info_list)

# Sort student records (dicts) by age, youngest first.
stu_info = [
    {"name": 'zhangsan', "age": 18},
    {"name": 'lisi', "age": 30},
    {"name": 'wangwu', "age": 99},
    {"name": 'tiaqi', "age": 3},
]
stu_info.sort(key=lambda student: student['age'])
print(stu_info)
# List comprehensions and dict comprehensions.

# Building a list with an ordinary for loop ...
li = []
for i in range(10):
    li.append(i)
print(li)

# ... and the same list as a comprehension:
#     [expression for variable in iterable <optional condition>]
print([i for i in range(10)])

# Filtering: keep only the even numbers, loop version first ...
li = []
for i in range(10):
    if i % 2 == 0:
        li.append(i)
print(li)

# ... then the comprehension with a trailing condition.
print([i for i in range(10) if i % 2 == 0])

# Keep only the numbers greater than 0 from a list of random integers.
from random import randint

num_list = [randint(-10, 10) for _ in range(10)]
print(num_list)
print([i for i in num_list if i > 0])

# Dict comprehension: random grades for 100 students
# ('student1' .. 'student100').
stu_grades = {'student{}'.format(i): randint(50, 100) for i in range(1, 101)}
print(stu_grades)

# Keep only the students who scored more than 60.
print({k: v for k, v in stu_grades.items() if v > 60})
3. Matplotlib
Matplotlib 是一個Python的2D繪圖庫,它以各種硬拷貝格式和跨平臺的交互式環境生成出版質量級別的圖形。
通過 Matplotlib,開發者可以僅需要幾行代碼,便可以生成繪圖,直方圖,功率譜,條形圖,錯誤圖,散點圖等。
繪制圖形
# Line plot with matplotlib: sine and cosine on the same axes.
import numpy as np
from matplotlib import pyplot as plt

# Font and minus-sign settings applied before any text is drawn.
plt.rcParams["font.sans-serif"] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# 100 evenly spaced sample points on [0, 2*pi]; linspace includes both ends.
x = np.linspace(0, 2 * np.pi, num=100)
print(x)

# Sample both curves at the same x positions.
y = np.sin(x)
cosy = np.cos(x)

# Sine as a dashed green line, cosine as a solid red line.
plt.plot(x, y, label='sin(x)', linestyle='--', color='g')
plt.plot(x, cosy, label='cos(x)', color='r')

# Axis labels and title.
plt.xlabel('時間(s)')
plt.ylabel('電壓(V)')
plt.title('歡迎來到python世界')

# Show the legend built from the `label` arguments, then display the figure.
plt.legend()
plt.show()
# Bar chart.
import string
from random import randint

# Category names built from the first five uppercase letters ('A'..'E'),
# e.g. string.ascii_uppercase[0:6] would be 'ABCDEF'.
x = ['口紅{}'.format(letter) for letter in string.ascii_uppercase[:5]]
# One random price in [200, 500] per category.
y = [randint(200, 500) for _ in range(5)]
print(x)
print(y)

# Label the axes, draw the bars and display the figure.
plt.xlabel('口紅品牌')
plt.ylabel('價格(元)')
plt.bar(x, y)
plt.show()
# Pie chart.
from random import randint
import string

# Six random values in [3500, 9000], one per slice.
counts = [randint(3500, 9000) for _ in range(6)]
# Slice labels built from the first six lowercase letters.
labels = ['員工{}'.format(letter) for letter in string.ascii_lowercase[:6]]

# Per-slice offset from the centre: only the first slice is pulled out.
explode = [0.1, 0, 0, 0, 0, 0]
colors = ['red', 'purple', 'blue', 'yellow', 'gray', 'green']

plt.pie(
    counts,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',  # percentage text with one decimal place
    shadow=True,
)
plt.legend(loc=2)
# Equal axis scaling.
plt.axis('equal')
plt.show()
# Scatter plots.
# 100 points whose coordinates are drawn from a normal distribution with
# mean 0 and standard deviation 1.
x = np.random.normal(loc=0, scale=1, size=100)
y = np.random.normal(loc=0, scale=1, size=100)
plt.scatter(x, y)
plt.show()

# The same with 1,000,000 points.
x = np.random.normal(loc=0, scale=1, size=1000000)
y = np.random.normal(loc=0, scale=1, size=1000000)
# alpha sets the transparency of each marker.
plt.scatter(x, y, alpha=0.1)
plt.show()
4.練習
4.1 紅樓夢TOP10人物分析
import jieba
from wordcloud import WordCloud
# 1. Read the full text of the novel into memory.
with open('./all.txt', 'r', encoding='utf-8') as f:
    words = f.read()

counts = {}  # word -> number of occurrences

# Words that must not appear in the ranking.
excludes = {"什么", "一個", "我們", "你們", "如今", "說道", "知道", "起來", "這里",
            "出來", "眾人", "那里", "自己", "一面", "只見", "太太", "兩個", "沒有",
            "怎么", "不是", "不知", "這個", "聽見", "這樣", "進來", "咱們", "就是",
            "老太太", "東西", "告訴", "回來", "只是", "大家", "姑娘", "奶奶", "鳳姐兒"}

# 2. Segment the text into a list of words and count them.
words_list = jieba.lcut(words)
for word in words_list:
    # Tokens of length 0 or 1 are not counted.
    if len(word) <= 1:
        continue
    # `counts[word] + 1` would raise KeyError the first time a word is seen;
    # dict.get(word, 0) supplies 0 for a missing key instead.
    counts[word] = counts.get(word, 0) + 1
print(len(counts))

# 3. Merge different names of the same person, then drop excluded words.
#    Each alias is popped while it is added to the canonical name, so the
#    same person cannot show up twice in the ranking (previously 寶玉, 黛玉
#    and 林妹妹 stayed in `counts` next to the merged totals). Any count
#    already stored under the canonical name is kept and added to; before,
#    an existing 林黛玉 count was overwritten. pop(..., 0) / get(..., 0)
#    tolerate names that never occur in the text.
aliases = {
    '賈母': ('老太太',),
    '林黛玉': ('林妹妹', '黛玉'),
    '賈寶玉': ('寶玉',),
}
for canonical, other_names in aliases.items():
    merged = counts.get(canonical, 0)
    for other in other_names:
        merged += counts.pop(other, 0)
    counts[canonical] = merged
# pop(word, None) ignores words that are absent, whereas `del counts[word]`
# would raise KeyError.
for word in excludes:
    counts.pop(word, None)

# 4. Sort the (word, count) pairs by count, highest first.
items = list(counts.items())
print(items)


def sort_by_count(x):
    """Return the count element of a ``(word, count)`` pair (sort key)."""
    return x[1]


items.sort(key=sort_by_count, reverse=True)

# Repeat every top-10 name `count` times so the word cloud sizes each name
# by its frequency. Slicing keeps this safe when fewer than 10 entries remain.
li = []
for role, count in items[:10]:
    print(role, count)
    li.extend([role] * count)

# 5. Render the result as a word cloud.
text = ' '.join(li)
WordCloud(
    font_path='msyh.ttc',
    background_color='black',
    width=800,
    height=600,
    # Do not treat two adjacent words as a collocation (bigram); the text is
    # the same name repeated, which would otherwise pair up with itself.
    collocations=False
).generate(text).to_file('top10.png')
紅樓夢人物分析
4.2 繪制三國top10人物餅圖
#繪制三國人物TOP10餅圖
import jieba
from matplotlib import pyplot as plt
# Font and minus-sign settings applied before any text is drawn.
plt.rcParams["font.sans-serif"] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# 1. Read the full text of the novel into memory.
with open('./novel/threekingdom.txt', 'r', encoding='utf-8') as f:
    words = f.read()

counts = {}  # word -> number of occurrences, e.g. {'曹操': 234, '回寨': 56}

# Words that must not appear in the ranking: non-name words plus the alias
# spellings whose counts are folded into a canonical name in step 3.
excludes = {"將軍", "卻說", "丞相", "二人", "不可", "荊州", "不能", "如此", "商議",
            "如何", "主公", "軍士", "軍馬", "左右", "次日", "引兵", "大喜", "天下",
            "東吳", "于是", "今日", "不敢", "魏兵", "陛下", "都督", "人馬", "不知",
            "孔明曰", "玄德曰", "劉備", "云長"}

# 2. Segment the text into a list of words and count them.
words_list = jieba.lcut(words)
for word in words_list:
    # Tokens of length 0 or 1 are not counted.
    if len(word) <= 1:
        continue
    # `counts[word] + 1` would raise KeyError the first time a word is seen;
    # dict.get(word, 0) supplies 0 for a missing key instead.
    counts[word] = counts.get(word, 0) + 1
print(len(counts))

# 3. Merge different spellings of the same person, then drop excluded words.
#    get(..., 0) / pop(word, None) tolerate words that never occur in the
#    text, where plain indexing / `del` would raise KeyError.
counts['孔明'] = counts.get('孔明', 0) + counts.get('孔明曰', 0)
counts['玄德'] = (counts.get('玄德', 0) + counts.get('玄德曰', 0)
                + counts.get('劉備', 0))
counts['關公'] = counts.get('關公', 0) + counts.get('云長', 0)
for word in excludes:
    counts.pop(word, None)

# 4. Sort the (word, count) pairs by count, highest first.
items = list(counts.items())
print(items)


def sort_by_count(x):
    """Return the count element of a ``(word, count)`` pair (sort key)."""
    return x[1]


items.sort(key=sort_by_count, reverse=True)

# Split the top-10 pairs into parallel lists: slice sizes and slice labels.
# Slicing keeps this safe when fewer than 10 entries remain.
counthtml = []
sanguo = []
for role, count in items[:10]:
    print(role, count)
    counthtml.append(count)
    sanguo.append(role)

# 5. Draw the pie chart.
plt.pie(counthtml, shadow=True, labels=sanguo, autopct='%1.1f%%')
plt.legend(loc=2)
plt.axis('equal')
plt.show()
三國演義人物餅圖