一山析、項目背景
Adventure Works Cycle是國內(nèi)一家生產(chǎn)和銷售自行車及和相關(guān)配件的制造公司堰燎。利用每日商品銷售及相關(guān)客戶信息數(shù)據(jù),獲取商品銷售趨勢笋轨、地域分布情況和用戶畫像秆剪,幫助運營人員自主和實時分析。
二爵政、數(shù)據(jù)處理
1.數(shù)據(jù)結(jié)構(gòu)
數(shù)據(jù)結(jié)構(gòu).png
2.需求確認
- 1.全年自行車整體銷售表現(xiàn)(2020年)
- 2.上月自行車地域銷售表現(xiàn)(2020年12月)
- 3.上月自行車產(chǎn)品銷售表現(xiàn)
- 4.用戶行為分析
- 5.上月熱品銷售分析
3.數(shù)據(jù)處理
##########1.導(dǎo)包,連接數(shù)據(jù)庫讀取數(shù)據(jù)######
#導(dǎo)包
import pandas as pd
import numpy as np
import pymysql
pymysql.install_as_MySQLdb()
from sqlalchemy import create_engine
#連接數(shù)據(jù)庫,讀取數(shù)據(jù)
engine=create_engine('mysql://root:******@127.0.0.1:3306/adventure?charset=gbk')
gather_customer_order=pd.read_sql_query('select * from dw_customer_order',con=engine)
#查看時間地區(qū)產(chǎn)品聚合表的情況
gather_customer_order.head()
輸出為:image.png
######2.執(zhí)行第一個需求,對全年自行車銷售整體情況進行數(shù)據(jù)處理以及分析######
#將create_date按月新增一列
gather_customer_order['year_month']=gather_customer_order['create_date'].apply(lambda x:x.strftime('%Y-%m'))
gather_customer_order.head()
輸出為:image.png
#查看時間產(chǎn)品地區(qū)聚合表的簡要信息
gather_customer_order.info()
輸出為:image.png
#
#查看產(chǎn)品類別有哪幾類
gather_customer_order.cplb_zw.unique()
輸出為:image.png
#篩選產(chǎn)品類別為自行車的信息
gather_customer_order_cycle=gather_customer_order[gather_customer_order['cplb_zw']=='自行車']
gather_customer_order_cycle.head()
輸出為:image.png
#根據(jù)月份查看自行車的銷售數(shù)量與銷售額
pt_overall_sale_performance=gather_customer_order_cycle.groupby('year_month').agg({'order_num':sum,'sum_amount':sum}).reset_index()
#查看整體銷售情況表的基本數(shù)據(jù)
pt_overall_sale_performance
輸出為:image.png
##添加兩列,分別是銷售量環(huán)比和銷售額環(huán)比
pt_overall_sale_performance['order_num_diff']=pt_overall_sale_performance.iloc[:,1:2].pct_change()
pt_overall_sale_performance['sum_amount_diff']=pt_overall_sale_performance.iloc[:,2:3].pct_change()
#替換na值為0
pt_overall_sale_performance_1_wt=pt_overall_sale_performance.fillna(0)
pt_overall_sale_performance_1_wt
輸出為:image.png
#將表存到數(shù)據(jù)庫
engine=create_engine('mysql://root:******@127.0.0.1:3306/adventure?charset=gbk')
pt_overall_sale_performance_1_wt.to_sql('pt_overall_sale_performance_1_wt',con=engine,if_exists='replace',index=False)
######3.執(zhí)行第二項需求數(shù)據(jù)處理新荤。上月自行車地域銷售表現(xiàn)#####
#篩選出2020年11月和12月的數(shù)據(jù),按地域和時間進行分組
cycle_territory_11_12=gather_customer_order_cycle[gather_customer_order_cycle['year_month'].isin(['2020-11','2020-12'])]
cycle_territory_11_12_group=cycle_territory_11_12.groupby(['chinese_territory','year_month']).agg({'order_num':sum,'sum_amount':sum}).reset_index()
cycle_territory_11_12_group
輸出為:image.png
#計算每個地區(qū)的銷售量環(huán)比和銷售額環(huán)比
territory=list(cycle_territory_11_12_group['chinese_territory'].unique())
order_num_diff=pd.Series([])
sum_amount_diff=pd.Series([])
for i in territory:
a=cycle_territory_11_12_group.loc[cycle_territory_11_12_group['chinese_territory']==i]['order_num'].pct_change().fillna(0)
b=cycle_territory_11_12_group.loc[cycle_territory_11_12_group['chinese_territory']==i]['sum_amount'].pct_change().fillna(0)
order_num_diff=order_num_diff.append(a)
sum_amount_diff=sum_amount_diff.append(b)
cycle_territory_11_12_group['order_num_diff']=order_num_diff
cycle_territory_11_12_group['sum_amount_diff']=sum_amount_diff
cycle_territory_11_12_group
輸出為:image.png
###將地域情況表存入數(shù)據(jù)庫
engine=create_engine('mysql://frogdata05:Frogdata!1321@106.15.121.232:3306/datafrog05_adventure?charset=gbk')
cycle_territory_11_12_group.to_sql('pt_bicy_territory_2_wt',con=engine,if_exists='replace',index=False)
##查看2020年12月銷量排名前十的城市
city_top_10=gather_customer_order_cycle[gather_customer_order_cycle['year_month']=='2020-12'].groupby('chinese_city').agg({'order_num':sum}).sort_values(by='order_num',ascending=False).reset_index()
city_top_10=city_top_10.iloc[0:10,:]
city_top_10
輸出為:image.png
#查找銷量前十城市11.12月的銷售情況
city_top_10_order=cycle_territory_11_12[cycle_territory_11_12['chinese_city'].isin(city_top_10['chinese_city'])]
#按城市竖慧、日期分組
city_top_10_order_group=city_top_10_order.groupby(['chinese_city','year_month']).agg({'order_num':sum,'sum_amount':sum}).reset_index()
#計算按成績計算銷售量環(huán)比和銷售額環(huán)比
city=list(city_top_10_order_group['chinese_city'].unique())
order_num_diff=pd.Series([])
sum_amount_diff=pd.Series([])
for i in city:
a=city_top_10_order_group.loc[city_top_10_order_group['chinese_city']==i]['order_num'].pct_change().fillna(0)
b=city_top_10_order_group.loc[city_top_10_order_group['chinese_city']==i]['sum_amount'].pct_change().fillna(0)
order_num_diff=order_num_diff.append(a)
sum_amount_diff=sum_amount_diff.append(b)
city_top_10_order_group['order_num_diff']=order_num_diff
city_top_10_order_group['sum_amount_diff']=sum_amount_diff
city_top_10_order_group
輸出為:image.png
##將銷量城市top10環(huán)比表存入數(shù)據(jù)庫
engine=create_engine('mysql://root:******@127.0.0.1:3306/adventure?charset=gbk')
city_top_10_order_group.to_sql('pt_bicy_city_3_wt.to_sql',con=engine,if_exists='replace',index=False)
######4.執(zhí)行第三項需求數(shù)據(jù)處理。上月自行車產(chǎn)品銷售表現(xiàn)#####
#求每個月自行車累計銷量
gather_customer_order_group_month=gather_customer_order_cycle.groupby('year_month').order_num.sum().reset_index()
#將累計銷量與自行車銷售信息表連接
order_num_proportion=pd.merge(gather_customer_order_cycle,gather_customer_order_group_month,on='year_month')
#計算自行車每月銷量占比(每輛自行車當月占比情況夕土,便于可視化輸出)
order_num_proportion['order_proportion']=order_num_proportion['order_num_x']/order_num_proportion['order_num_y']
#重命名
order_num_proportion=order_num_proportion.rename(columns={'order_num_y':'sum_month_order'})
輸出為:image.png
#將每月自行車銷售信息存入數(shù)據(jù)庫
engine=create_engine('mysql://root:******@1127.0.01:3306/adventure?charset=gbk')
order_num_proportion.to_sql('pt_bicycle_product_sales_month_4_wt',con=engine,if_exists='replace',index=False)
#####查看不同產(chǎn)品自行車表現(xiàn),對數(shù)據(jù)進行整理####
#查看自行車有哪些產(chǎn)品子類
gather_customer_order_cycle['cpzl_zw'].unique()
輸出為:image.png
####分別查看自行車各子類產(chǎn)品銷售表現(xiàn)####
##########公路自行車###########
gather_customer_order_road=gather_customer_order_cycle[gather_customer_order_cycle['cpzl_zw']=='公路自行車']
#查看公路自行車每月各產(chǎn)品銷售情況
gather_customer_order_road_month=gather_customer_order_road.groupby(['year_month','product_name']).order_num.sum().reset_index()
gather_customer_order_road_month['cpzl_zw']='公路自行車'
#查看公路自行車每月的總銷量
gather_customer_order_road_month_sum=gather_customer_order_road.groupby('year_month').order_num.sum().reset_index()
#將兩表合并
gather_customer_order_road_month=pd.merge(gather_customer_order_road_month,gather_customer_order_road_month_sum,on='year_month')
#########山地自行車#########
gather_customer_order_mountain=gather_customer_order_cycle[gather_customer_order_cycle['cpzl_zw']=='山地自行車']
##查看山地自行車每月每種產(chǎn)品銷售情況
gather_customer_order_mountain_month=gather_customer_order_mountain.groupby(['year_month','product_name']).order_num.sum().reset_index()
gather_customer_order_mountain_month['cpzl_zw']='山地自行車'
#查看山地自行車每月總銷售量
gather_customer_order_mountain_month_sum=gather_customer_order_mountain.groupby('year_month').order_num.sum().reset_index()
#合并兩表
gather_customer_order_mountain_month=pd.merge(gather_customer_order_mountain_month,gather_customer_order_mountain_month_sum,on='year_month')
#########旅游自行車##########
gather_customer_order_tour=gather_customer_order_cycle[gather_customer_order_cycle['cpzl_zw']=='旅游自行車']
#查看旅游自行車每月每種產(chǎn)品銷售量
gather_customer_order_tour_month=gather_customer_order_tour.groupby(['year_month','product_name']).order_num.sum().reset_index()
#查看旅游自行車每月總銷售量
gather_customer_order_tour_month_sum=gather_customer_order_tour.groupby('year_month').order_num.sum().reset_index()
#合并兩表
gather_customer_order_tour_month=pd.merge(gather_customer_order_tour_month,gather_customer_order_tour_month_sum,on='year_month')
gather_customer_order_tour_month['cpzl_zw']='旅游自行車'
#將公路、山地九昧、旅游自行車每月銷售信息表合并
gather_customer_order_month=pd.concat([gather_customer_order_mountain_month,gather_customer_order_road_month,gather_customer_order_tour_month])
gather_customer_order_month
輸出為:image.png
gather_customer_order_month['order_num_proportion']=gather_customer_order_month['order_num_x']/gather_customer_order_month['order_num_y']
#重命名
gather_customer_order_month=gather_customer_order_month.rename(columns={'order_num_y':'sum_order_month','order_num_x':'order_month_product'})
gather_customer_order_month.head()
輸出為:image.png
#存入數(shù)據(jù)庫
engine=create_engine('mysql://root:******@127.0.0.1:3306/adventure?charset=gbk')
gather_customer_order_month.to_sql('pt_bicycle_product_sales_order_month_4_wt',con=engine,if_exists='append',index=False)
##計算當月自行車產(chǎn)品環(huán)比
#篩選出11,12月數(shù)據(jù)
gather_customer_order_month_11_12=gather_customer_order_month[gather_customer_order_month['year_month'].isin(['2020-11','2020-12'])]
#按產(chǎn)品類別和日期排序
gather_customer_order_month_11_12=gather_customer_order_month_11_12.sort_values(by=['product_name','year_month'],ascending=True)
#計算自行車銷量環(huán)比
product_name_list=list(gather_customer_order_month_11_12.product_name.unique())
order_top_x=pd.Series([])
for i in product_name_list:
a=gather_customer_order_month_11_12.loc[gather_customer_order_month_11_12['product_name']==i]['order_month_product'].pct_change().fillna(0)
order_top_x=order_top_x.append(a)
gather_customer_order_month_11_12['order_num_diff']=order_top_x
#篩選出12月自行車數(shù)據(jù)
gather_customer_order_month_12=gather_customer_order_month_11_12[gather_customer_order_month_11_12['year_month']=='2020-12']
#計算2020年全年累計銷量
gather_customer_order_month_sum=gather_customer_order_month.groupby('product_name').order_month_product.sum().reset_index()
gather_customer_order_month_sum=gather_customer_order_month_sum.rename(columns={'order_month_product':'sum_order_2020'})
gather_customer_order_month_sum
#關(guān)聯(lián)累計銷量表與12月自行車環(huán)比表
gather_customer_order_month_12=pd.merge(gather_customer_order_month_12,gather_customer_order_month_sum,on='product_name')
gather_customer_order_month_12
輸出為:image.png
#將表保存到數(shù)據(jù)庫
engine=create_engine('mysql://root:F******@127.0.0.1:3306/adventure?charset=gbk')
gather_customer_order_month_12.to_sql('pt_bicycle_product_sales_order_month_12_5_wt',con=engine,if_exists='append',index=False)
######5.執(zhí)行第四項需求進行用戶行為數(shù)據(jù)處理######
###讀取數(shù)據(jù)
engine = create_engine('mysql://root:******@127.0.0.1:3306/adventure_ods?charset=gbk')
df_customer=pd.read_sql_query("select customer_key,birth_date,gender,marital_status from ods_customer where create_date<='2020-12-31'",con=engine)
engine = create_engine('mysql://root:******@127.0.0.1:3306/adventure_ods?charset=gbk')
df_sales_orders = pd.read_sql_query("select * from ods_sales_orders where create_date>='2020-12-1' and create_date<='2020-12-31'",con = engine)
#查看數(shù)據(jù)情況
df_customer.head()
df_sales_orders.head()
輸出為:image.png
image.png
#將兩表連接
sales_customer_order=pd.merge(df_customer,df_sales_orders,on='customer_key',how='inner')
sales_customer_order.head()
輸出:image.png
#提取出生年份
customer_birth_year = sales_customer_order.birth_date.str.split('-',expand=True).rename(columns={0:'birth_year'}).drop(labels=[1,2],axis=1)
#合并
sales_customer_order=pd.concat([sales_customer_order,customer_birth_year],axis=1)
#修改出生年為int數(shù)據(jù)類型
sales_customer_order['birth_year'] = sales_customer_order['birth_year'].astype('int')
#計算用戶年齡
sales_customer_order['customer_age'] = 2021 - sales_customer_order['birth_year']
#用戶年齡分層
#年齡分層1
age_level=pd.cut(sales_customer_order['customer_age'],[30,35,40,45,50,55,60,65],labels=['30-34','35-39','40-44','45-49','50-54','55-59','60-64'],right=False)
#新增'age_level'分層區(qū)間列
sales_customer_order['age_level'] =age_level
sales_customer_order.head()
輸出為:image.png
#篩選銷售訂單為自行車的訂單信息
df_customer_order_bycle = sales_customer_order.loc[sales_customer_order['cplb_zw'] == '自行車']
# 計算年齡比率(可用于各類占比計算)
df_customer_order_bycle['age_level_rate']=1/df_customer_order_bycle.customer_age.count()
#將年齡分為3個層次
df_customer_order_bycle.loc[(df_customer_order_bycle['customer_age'] <= 29),'age_level2'] = '<=29'
df_customer_order_bycle.loc[(df_customer_order_bycle['customer_age'] >= 30) & (df_customer_order_bycle['customer_age'] < 40),'age_level2'] = '30-39'
df_customer_order_bycle.loc[(df_customer_order_bycle['customer_age'] >= 40),'age_level2'] = '>=40'
df_customer_order_bycle.head()
輸出為:
image.png
# 求每個年齡段人數(shù)
age_level2_count = df_customer_order_bycle.groupby(by = 'age_level2').sales_order_key.count().reset_index()
age_level2_count
輸出為:image.png
#用戶性別占比
gender_count = df_customer_order_bycle.groupby(by = 'gender').cplb_zw.count().reset_index()
gender_count
image.png
#合并年齡段表毕匀,計算占比
df_customer_order_bycle = pd.merge(df_customer_order_bycle,age_level2_count,on = 'age_level2').rename(columns = {'sales_order_key_y':'age_level2_count'})
df_customer_order_bycle['age_level2_rate'] = 1/df_customer_order_bycle['age_level2_count']
#合并性別表铸鹰,計算占比
df_customer_order_bycle = pd.merge(df_customer_order_bycle,gender_count,on = 'gender').rename(columns = {'cplb_zw_y':'gender_count'})
df_customer_order_bycle['gender_rate'] = 1/df_customer_order_bycle['gender_count']
df_customer_order_bycle.head()
輸出為:image.png
#存入數(shù)據(jù)庫
engine = create_engine('mysql://root:******@127.0.0.1:3306/adventure?charset=gbk')
df_customer_order_bycle.to_sql('pt_user_behavior_20_wt',con = engine,if_exists='append', index=False)
######6.執(zhí)行第五項需求,熱品銷售分析######
#篩選12月數(shù)據(jù)
gather_customer_order_12 = gather_customer_order_cycle.loc[gather_customer_order_cycle['year_month'] == '2020-12']
gather_customer_order_12.head()
#計算產(chǎn)品銷售數(shù)量
#按照銷量降序皂岔,取TOP10產(chǎn)品
customer_order_12_top10 = gather_customer_order_12.groupby(by = 'product_name').order_num.count().reset_index().\
sort_values(by = 'order_num',ascending = False).head(10)
customer_order_12_top10
輸出為:image.png
#篩選top10產(chǎn)品的銷量與環(huán)比
#銷量top10
customer_order_month_11_12 = gather_customer_order_month_11_12[['year_month','product_name','order_month_product','cpzl_zw','order_num_diff']]
customer_order_month_11_12 = customer_order_month_11_12[customer_order_month_11_12['product_name'].\
isin(list(customer_order_12_top10['product_name']))]
customer_order_month_11_12['category'] = '本月TOP10銷量'
customer_order_month_12 = gather_customer_order_month_11_12.loc[gather_customer_order_month_11_12['year_month'] == '2020-12'].\
sort_values(by = 'order_num_diff',ascending = False).head(10)
customer_order_month_11_12 = gather_customer_order_month_11_12[['year_month','product_name','order_month_product','cpzl_zw','order_num_diff']]
#增速top10
customer_order_month_12_top10_seep = customer_order_month_11_12.loc[customer_order_month_11_12['product_name'].\
isin(list(customer_order_month_12['product_name']))]
customer_order_month_12_top10_seep['category'] = '本月TOP10增速'
#合并兩表
hot_products_12 = pd.concat([customer_order_month_11_12,customer_order_month_12_top10_seep],axis = 0)
hot_products_12
輸出:
image.png
#存入數(shù)據(jù)庫
engine = create_engine('mysql://frogdata05:Frogdata!1321@106.15.121.232:3306/datafrog05_adventure?charset=gbk')
datafrog=engine
hot_products_12.to_sql('pt_hot_products_wt',con = datafrog,if_exists='append', index=False)
三蹋笼、可視化輸出與報告輸出
1..png
2.png
3.png
4.png
5.png
6.png
7.png
8.png
9.png
10.png
11.png
12.png
13.png
14.png
15.png