導入庫
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import plotly_express as px
import plotly.graph_objects as go
數據基本信息
# Load the App Store dataset, using the CSV's first column as the row index.
data = pd.read_csv("AppleStore.csv", index_col=0)
# Preview the first rows.
data.head()
image.png
# Overall size of the dataset
data.shape
image.png
# Number of missing values per column
data.isnull().sum()
image.png
# Data type of each column
data.dtypes
image.png
# Descriptive statistics
data.describe()
image.png
APP信息統計
# Number of free apps (price equal to 0).
# Series.sum() counts the True values of the mask in one vectorized pass
# instead of iterating it element by element with the builtin sum().
(data.price == 0).sum()
image.png
# Number of apps priced at $50 or more; the article calls this tier "super expensive".
# NOTE(review): the original comment said "over $50", but the comparison is >=,
# so an app priced at exactly $50 is counted as well.
sum(data.price >= 50)
image.png
# Percentage of apps priced at $50 or more.
# The mean of a boolean mask is the fraction of True values, which replaces
# the original's element-wise division followed by a Python-level sum (its
# closing parenthesis was misplaced). The threshold is >= 50 so the figure
# agrees with the `sum(data.price >= 50)` count used elsewhere in this script.
(data.price >= 50).mean() * 100
image.png
sum(data.price >= 50) / len(data) * 100
image.png
# Outliers: apps priced at $50 or more.
# The cut-off is inclusive so that it complements `paidapps`
# (0 < price < 50); with a strict `> 50` an app priced at exactly $50 would
# fall into neither group.
outlier = data.loc[
    data.price >= 50,
    ["track_name", "price", "prime_genre", "user_rating"],
]
outlier
image.png
免費APP
# Rows whose price is exactly zero, followed by a preview of the first rows.
is_free = data["price"] == 0
freeapps = data.loc[is_free]
freeapps.head()
image.png
正常區間的APP
# Regular paid apps: price strictly between $0 and $50.
paidapps = data[(data["price"] > 0) & (data.price < 50)]
# Series.max()/min() are vectorized and skip NaN, unlike the builtin
# max()/min(), which iterate the column in Python.
print("max_price:", paidapps.price.max())
print("min_price:", paidapps.price.min())
image.png
# Price distribution of the regular paid apps, drawn as two stacked panels.
plt.style.use("fivethirtyeight")
fig, (hist_ax, strip_ax) = plt.subplots(2, 1, figsize=(12, 10))

# Top panel: histogram of prices with a log-scaled frequency axis.
hist_ax.hist(paidapps.price, log=True)
hist_ax.set_title("Price distribution of apps (Log scale)")
hist_ax.set_ylabel("Frequency Log scale")
hist_ax.set_xlabel("Price Distributions in ($)")

# Bottom panel: strip plot; jitter spreads out points that would overlap.
strip_ax.set_title("Visual Price distribution")
sns.stripplot(data=paidapps, x="price", jitter=True, size=6, ax=strip_ax)
plt.show()
image.png
1.隨著價格的上漲,付費應用的數量呈現指數級的下降
2.很少應用的價格超過30美元;因此,盡量保持價格在30美元以下
category對價格分布的影響
data.columns
image.png
# Genres and the number of apps in each
data["prime_genre"].value_counts()
image.png
# Price spread of paid apps for five genres, one strip plot per row.
yrange = [0, 25]  # shared x-axis (price) limits, despite the name
fsize = 15  # title font size
# (genre, point colour) for each row, top to bottom
genre_colors = [
    ("Games", "#eb5e66"),
    ("Entertainment", "#ff8300"),
    ("Education", "#20B2AA"),
    ("Photo & Video", "#b84efd"),
    ("Utilities", "#084cfd"),
]
# Per-genre subsets of the paid apps; the original module-level names are
# kept in case later cells rely on them.
games, ent, edu, pv, ut = (
    paidapps[paidapps["prime_genre"] == genre] for genre, _ in genre_colors
)

plt.figure(figsize=(12, 10))
panels = zip((games, ent, edu, pv, ut), genre_colors)
for row, (subset, (genre, color)) in enumerate(panels, start=1):
    plt.subplot(len(genre_colors), 1, row)
    plt.xlim(yrange)
    # Every panel draws its own subset; the original plotted the
    # "Photo & Video" data a second time under the "Utilities" title.
    sns.stripplot(data=subset, x="price", jitter=True, size=6, color=color)
    plt.title(genre, fontsize=fsize)
    plt.xlabel("")
image.png
1.Games游戲類的apps價格相對高且分布更廣,直到25美元
2.Entertainment娛樂類的apps價格相對較低
Paid apps Vs Free apps
# Comparison between paid and free apps
# Number of apps per genre
categories = data["prime_genre"].value_counts()
categories
image.png
len(categories)
image.png
# Keep the first 4 genre labels from the value_counts index
s = categories.index[:4]
s
image.png
def categ(x, top_genres=None):
    """Return ``x`` if it is one of the top genres, otherwise ``"Others"``.

    Args:
        x: Genre label to classify.
        top_genres: Container of genre labels that are kept as-is. When
            omitted, the module-level ``s`` is used.

    Returns:
        ``x`` unchanged when ``x in top_genres``, else the string ``"Others"``.
    """
    if top_genres is None:
        top_genres = s
    return x if x in top_genres else "Others"
# Map every app's genre through categ() into a new "broad_genre" column
data["broad_genre"] = data["prime_genre"].apply(categ)
data.head()
image.png
# Count apps per broad genre within the free and paid groups
# Free apps (price == 0)
data[data.price==0].broad_genre.value_counts()
image.png
# Free apps: per-genre counts, sorted by genre label, as a DataFrame
free = data[data.price==0].broad_genre.value_counts().sort_index().to_frame()
free
image.png
# Paid apps (price > 0): per-genre counts, sorted by genre label, as a DataFrame
paid = data[data.price > 0].broad_genre.value_counts().sort_index().to_frame()
paid
image.png
# All apps: per-genre counts, sorted by genre label, as a DataFrame
total = data.broad_genre.value_counts().sort_index().to_frame()
total
image.png
# Give each frame a distinct column name before the frames are combined
free.columns = ["free"]
paid.columns = ["paid"]
total.columns = ["total"]
free
image.png
# Side-by-side comparison of the counts
dist = free.join(paid).join(total)
# Alternative: concatenate along the columns. This assignment replaces the
# joined frame above; the columns come out in the order paid, free, total.
dist = pd.concat([paid, free, total], axis=1)
dist
image.png
# Percentage of paid and of free apps within each genre
dist["paid_per"] = dist.paid / dist.total * 100
dist["free_per"] = dist.free / dist.total * 100
dist
image.png
# Highlight the maximum values
dist.style.highlight_max()
image.png
1.Games相關的APP是最多的,不管是paid還是free
2.從付費占比來看,Education教育類型占比最大
3.從免費占比來看,Entertainment娛樂類型的占比最大
付費和免費的占比
# Build the plotting data
# Free-app percentage per genre, as a plain list
list_free = dist.free_per.tolist()
list_free
image.png
# Stacked bar chart: free vs. paid share per genre.
# Convert the percentage columns to tuples for plotting.
tuple_free = tuple(list_free)
tuple_paidapps = tuple(dist.paid_per.tolist())
plt.figure(figsize=(12, 8))
# One bar per row of `dist`; the original hard-coded 5, which breaks as soon
# as the number of genre groups changes.
N = len(dist)
ind = np.arange(N)
width = 0.56  # width of each bar
p1 = plt.bar(ind, tuple_free, width, color="#45cea2")
# The paid share is stacked on top of the free share.
p2 = plt.bar(ind, tuple_paidapps, width, bottom=tuple_free, color="#fdd400")
plt.xticks(ind, tuple(dist.index.tolist()))
plt.legend((p1[0], p2[0]), ("free", "paid"))
plt.show()
image.png
# 餅圖
pies = dist[["free_per", "paid_per"]]
pies.columns = ["free %", "paid %"]
pies
image.png
# One pie per genre showing its free/paid split.
# pandas creates its own figure for the subplots (sized by `figsize`), so the
# original's preceding plt.figure() call only produced an extra empty figure
# and has been dropped.
pies.T.plot.pie(
    subplots=True,  # one pie per column of the transposed frame
    figsize=(20, 4),
    colors=["#45cea2", "#fad470"],
)
plt.show()
image.png
1.在教育類的APP中,付費paid的占比是很高的
2.相反的,在娛樂類的APP中,免費free的占比是很高的
付費APP真的足夠好嗎?
# Label every app as "Paid" (price > 0) or "Free" (anything else).
# np.where evaluates the whole column at once instead of calling a Python
# lambda once per row.
data["category"] = np.where(data["price"] > 0, "Paid", "Free")
data.head()
image.png
# Violin plot of user ratings per broad genre, free vs. paid.
plt.figure(figsize=(15, 8))
plt.style.use("fast")
plt.ylim([0, 5])
plt.title("Distribution of User ratings")
# `vertical` and `kde` are not violinplot parameters (with a categorical x and
# a numeric y the violins are already vertical), so they have been removed;
# seaborn 0.13 documents extra keywords as being passed through to
# matplotlib's fill_between, which does not accept them.
sns.violinplot(
    data=data,
    y="user_rating",
    x="broad_genre",
    hue="category",
    split=True,  # draw the two hue levels as the two halves of one violin
    linewidth=2,
    scale="count",  # violin width scales with the number of observations
    palette=["#fdd470", "#45cea2"],
)
plt.xlabel(" ")
plt.ylabel("Rating(0-5)")
plt.show()
image.png
1.在Education類的APP中,paid的占比是明顯高于free;其次是Photo & Video
2.Entertainment娛樂的APP,free占比高于paid;且整體的占比分布更為寬
# Same rating distribution, with free and paid drawn as separate violins.
plt.figure(figsize=(15, 8))
plt.style.use("fast")
plt.ylim([0, 5])
plt.title("Distribution of User ratings")
# `vertical` and `kde` are not violinplot parameters (with a categorical x and
# a numeric y the violins are already vertical), so they have been removed;
# seaborn 0.13 documents extra keywords as being passed through to
# matplotlib's fill_between, which does not accept them.
sns.violinplot(
    data=data,
    y="user_rating",
    x="broad_genre",
    hue="category",
    split=False,  # one full violin per hue level, side by side
    linewidth=2,
    scale="count",  # violin width scales with the number of observations
    palette=["#fdd470", "#45cea2"],
)
plt.xlabel(" ")
plt.ylabel("Rating(0-5)")
plt.show()
image.png
size和price關系
# Relationship between app size and price.
sns.color_palette("husl", 8)
sns.set_style("whitegrid")
flatui = ["#9b59b6", "#3498db", "#95a5a6", "#e74c3c", "#34495e", "#2ecc71"]
# App size in units of 1048576 (2**20) bytes, computed on the whole column at
# once instead of through a per-row lambda.
data["MB"] = data.size_bytes / 1048576
# Keep paid apps priced below $30.
paidapps_regression = data[((data.price < 30) & (data.price > 0))]
# One scatter panel per broad genre, two panels per row, no regression line.
sns.lmplot(
    data=paidapps_regression,
    x="MB",
    y="price",
    height=4,
    aspect=2,
    col_wrap=2,
    hue="broad_genre",
    col="broad_genre",
    fit_reg=False,
    palette=sns.color_palette("husl", 5),
)
plt.show()
image.png
# The same size-vs-price scatter, implemented with Plotly:
# one facet per broad genre, two facets per row
px.scatter(paidapps_regression,
x="MB",
y="price",
color="broad_genre",
facet_col="broad_genre",
facet_col_wrap=2)
image.png
# App categories: can they be split by paid vs. free?
# Comparison of the 5 genre groups
# Colours and figure size
BlueOrangeWapang = ["#fc910d", "#fcb13e", "#239cd3", "#1674b1", "#ed6d50"]
plt.figure(figsize=(10, 10))
# Data: genre labels and their app counts, both ordered by genre label
label_names = data.broad_genre.value_counts().sort_index().index
size = data.broad_genre.value_counts().sort_index().tolist()
# White circle added over the centre of the pie, leaving a ring visible
my_circle = plt.Circle((0, 0), 0.5, color="white")
# Pie chart
plt.pie(size, labels=label_names, colors=BlueOrangeWapang)
p = plt.gcf()
p.gca().add_artist(my_circle)
plt.show()
image.png
# The same donut chart, implemented with Plotly.
# px.pie's `labels` argument is a mapping from column names to display names,
# not the slice labels (those come from `names`), so the original
# `labels=label_names` has been dropped.
fig = px.pie(values=size, names=label_names, hole=0.5)
fig.update_traces(textposition="inside", textinfo="percent+label")
fig.show()
image.png
# 5 genre groups, each split into free and paid
# Free counts are placed on the even index values 0, 2, ..., 8
f = pd.DataFrame(index=np.arange(0, 10, 2),
data=dist.free.values,
columns=["num"])
# Paid counts are placed on the odd index values 1, 3, ..., 9
p = pd.DataFrame(index=np.arange(1, 11, 2),
data=dist.paid.values,
columns=["num"])
# Sorting by index interleaves the two frames: free, paid, free, paid, ...
# NOTE(review): `names` presumably has no effect here because no `keys` are passed — confirm.
final = pd.concat([f, p], names=["labels"]).sort_index()
final
image.png
final.num.tolist()
image.png
# Nested pie: genre groups on the outer ring, free/paid split on the inner one.
genre_counts = data.broad_genre.value_counts().sort_index()
group_names = genre_counts.index
group_size = genre_counts.tolist()
h = ["Free", "Paid"]
# Repeat the Free/Paid labels and colours once per genre group rather than a
# hard-coded 5 times.
subgroup_names = len(group_names) * h
sub = ["#45cea2", "#fdd470"]
subcolors = len(group_names) * sub
subgroup_size = final.num.tolist()
# The original called plt.figure(figsize=(20, 20)) and then plt.subplots(),
# which left an empty 20x20 figure behind and drew the chart on a second,
# default-sized one; the size is now given to the figure that is actually used.
fig, ax = plt.subplots(figsize=(20, 20))
ax.axis("equal")
# Outer ring
mypie, _ = ax.pie(group_size, radius=2.5, labels=group_names, colors=BlueOrangeWapang)
plt.setp(mypie, width=1.2, edgecolor="white")
# Inner ring
mypie2, _ = ax.pie(
    subgroup_size,
    radius=1.6,
    labels=subgroup_names,
    labeldistance=0.7,
    colors=subcolors,
)
plt.margins(0, 0)
plt.show()
image.png
# Genre / paid-or-free breakdown, implemented with Plotly
# NOTE(review): values="MB" presumably sizes each sector by summed app size
# rather than by app count as in the matplotlib chart — confirm this is intended.
fig = px.sunburst(data,
path=["broad_genre", "category"],
values="MB")
fig.show()
image.png
來源:尤而小屋