導入庫
import pandas as pd
import numpy as np
pd.set_option("display.max_columns",33)
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix
import warnings
warnings.filterwarnings("ignore")
數據基本信息
# Load the training workbook and take a first look at it: leading rows,
# shape, per-column null counts and dtypes.
df = pd.read_excel("Data_Train.xlsx")
df.head()
df.shape
df.isna().sum()
df.dtypes

# Keep the original column names around as a plain Python list.
columns = list(df.columns)
columns
具體字段的中文含義:
Airline:不同類型的航空公司
Date_of_Journey:旅客的旅行開始日期
Source:旅客出發地
Destination:旅客目的地
Route:航班路線
Dep_Time:出發時間
Arrival_Time:抵達時間
Duration:持續時間;指的是航班完成從出發到目的地的旅程的整個時間
Total_Stops:總共停留地
Additional_Info:其他信息,比如:食物、設備信息等
Price:整個旅程的航班票價
# Print the frame summary (dtypes, non-null counts), then show pandas'
# default descriptive statistics.
df.info()
df.describe()
缺失值處理
# Bar chart of per-column completeness (needs the `missingno` package).
import missingno as mso

mso.bar(df, color="blue")
plt.show()

# Drop every row that has at least one missing value, then re-check the
# per-column null counts.
df.dropna(axis=0, how="any", inplace=True)
df.isna().sum()
時間相關字段處理
# Date/time handling.
# pd.to_datetime() converts string columns to datetime values; afterwards the
# `.dt` accessor (e.g. `.dt.day`, `.dt.month`) exposes the individual parts.
def change_to_datetime(col, data=None):
    """Convert one column to datetime, in place.

    Args:
        col: Label of the column to convert.
        data: DataFrame to modify. Defaults to the module-level ``df`` so
            existing single-argument calls keep working; passing the frame
            explicitly matches the other helpers in this file.

    Returns:
        None. The column is overwritten on ``data``.
    """
    if data is None:
        data = df
    # NOTE(review): no explicit `format=` / `dayfirst=` is given, so parsing
    # relies on pandas' inference - confirm against the raw date strings.
    data[col] = pd.to_datetime(data[col])
# Convert each date/time column in turn, then look at the resulting dtypes.
for time_col in ("Date_of_Journey", "Dep_Time", "Arrival_Time"):
    change_to_datetime(time_col)
df.dtypes
# Derive day-of-month and month from the journey date, then drop the
# original datetime column.
journey_date = df["Date_of_Journey"]
df["day"] = journey_date.dt.day
df["month"] = journey_date.dt.month
df.head()
df.drop(columns="Date_of_Journey", inplace=True)
# Departure-time and arrival-time handling
def extract_hour(data, col):
    """Add a ``<col>_hour`` column holding the hour of datetime column ``col``.

    ``data`` is modified in place; nothing is returned.
    """
    target = col + "_hour"
    hours = data[col].dt.hour
    data[target] = hours
def extract_minute(data, col):
    """Add a ``<col>_minute`` column holding the minute of datetime column ``col``.

    ``data`` is modified in place; nothing is returned.
    """
    target = col + "_minute"
    minutes = data[col].dt.minute
    data[target] = minutes
def drop_col(data, col):
    """Remove column ``col`` from ``data`` in place; nothing is returned."""
    data.drop(columns=col, inplace=True)
# For departure and arrival time alike: add the hour and minute columns,
# then drop the original datetime column.
for clock_col in ("Dep_Time", "Arrival_Time"):
    extract_hour(df, clock_col)
    extract_minute(df, clock_col)
    drop_col(df, clock_col)
df.head()
# Flight duration
# 1. Normalise every duration to the uniform "<H>h <M>m" form (e.g. "0h 1m")
# duration = list(df["Duration"])
# for i in range(len(duration)):
# if len(duration[i].split(' ')) == 2:
# pass
# else:
# if 'h' in duration[i]:
# duration[i] = duration[i] + ' 0m'
# else:
# duration[i] = '0h ' + duration[i]
def change_duration(x):
    """Pad a duration string so it carries both an hour and a minute part.

    A value that already contains both "h" and "m" is returned unchanged;
    an hours-only value gets " 0m" appended; anything else gets "0h "
    prepended.
    """
    has_hours = "h" in x
    has_minutes = "m" in x
    if has_hours and has_minutes:
        return x
    if has_hours:
        return x + " 0m"
    return "0h " + x
# Apply the normalisation so every Duration value has an hour and a minute part.
df["Duration"] = df["Duration"].apply(change_duration)
df.head()

# 2. Pull the hour and minute numbers out of Duration via named regex groups.
duration_pattern = r'(?P<dur_hour>\d+)h (?P<dur_minute>\d+)m'
df1 = df["Duration"].str.extract(duration_pattern)
df1.head()
df = df.join(df1)
df.head()
df.drop(columns="Duration", inplace=True)

# 3. Type conversion: check the dtypes of dur_hour / dur_minute before and
#    after casting them to int.
df.dtypes
for duration_part in ("dur_hour", "dur_minute"):
    df[duration_part] = df[duration_part].astype(int)
df.dtypes
字段編碼
# 1. Columns stored with the object dtype (the string-valued fields)
column = [name for name in df.columns if df[name].dtype == "object"]
column

# 2. Every remaining (non-object) column
continuous_col = [name for name in df.columns if df[name].dtype != "object"]
continuous_col
2種編碼技術
標稱數據:沒有任何順序,使用獨熱編碼one-hot encoding
有序數據:存在一定的順序，使用類型編碼LabelEncoder
# Frame holding only the object-dtype (categorical) columns.
# An explicit copy is taken because this frame is modified in place further
# down (column assignment, drop, fillna); mutating a bare `df[column]`
# selection is chained assignment and is not guaranteed to behave
# consistently across pandas versions.
categorical = df[column].copy()
categorical.head()
不同字段編碼處理
# Airline
# 1. Number of rows per airline
airline_counts = categorical["Airline"].value_counts()
airline = airline_counts.reset_index()
airline

# 2. Airline versus price (rows fed to the plot in descending price order)
flights_by_price = df.sort_values("Price", ascending=False)
plt.figure(figsize=(15, 8))
sns.boxplot(x="Airline", y="Price", data=flights_by_price)
plt.show()
Jet Airways Business公司的機票價格是最高的
其他公司的價格中位數是比較接近的
# 3. One-hot encode the airline; the first level is dropped so it acts as
#    the reference category.
airline_labels = categorical["Airline"]
Airline = pd.get_dummies(airline_labels, drop_first=True)
Airline.head()
# Total_Stops
# 1. Stops versus price
stops_by_price = df.sort_values("Price", ascending=False)
plt.figure(figsize=(15, 8))
sns.boxplot(x="Total_Stops", y="Price", data=stops_by_price)
plt.show()

# 2. Hand-written ordinal mapping (unlike the one-hot encoding used for the
#    airline): each label maps to its number of stops.
dict_stops = {
    "non-stop": 0,
    "1 stop": 1,
    "2 stops": 2,
    "3 stops": 3,
    "4 stops": 4,
}
stop_labels = categorical["Total_Stops"]
categorical["Total_Stops"] = stop_labels.map(dict_stops)
categorical.head()
# Source (departure city)
# Source versus price.
# `sns.catplot` is a figure-level function that creates its own figure, so a
# preceding `plt.figure(figsize=...)` only leaves an empty extra figure and
# its size is ignored. The intended 18x12 inch size is therefore requested
# through `height` / `aspect` (width = height * aspect).
sns.catplot(
    x="Source",
    y="Price",
    data=df.sort_values("Price", ascending=False),
    kind="boxen",
    height=12,
    aspect=1.5,
)
plt.show()

# One-hot encode the source; the first level is dropped as the reference.
source = pd.get_dummies(categorical["Source"], drop_first=True)
source.head()
# Destination
# Destination versus price
destination_by_price = df.sort_values("Price", ascending=False)
plt.figure(figsize=(18, 12))
sns.boxplot(x="Destination", y="Price", data=destination_by_price)
plt.show()

# One-hot encode the destination; the first level is dropped as the reference.
destination_labels = categorical["Destination"]
destination = pd.get_dummies(destination_labels, drop_first=True)
destination.head()
# Route
# 1. Number of rows per distinct route
categorical["Route"].value_counts()

# 2. Split the route into its legs.
# Per the original author's note the longest route has 5 place names, so five
# columns are extracted; positions a route does not have come out as NaN.
# The split is computed once and reused for every position.
route_parts = categorical["Route"].str.split("→")
for position in range(5):
    categorical[f"Route{position + 1}"] = route_parts.str[position]
categorical.head()

# 3. Missing values: drop the raw column, inspect the null counts, then fill
# the optional legs with the literal string "None".
categorical.drop("Route", axis=1, inplace=True)
categorical.isnull().sum()
for i in ["Route3", "Route4", "Route5"]:
    # Assign the result back: `categorical[i].fillna(..., inplace=True)` acts
    # on a temporary Series and does not update the frame under pandas
    # copy-on-write.
    categorical[i] = categorical[i].fillna("None")
# 4. Label-encode each route leg. The same encoder object is re-fitted for
#    every column, so the integer codes are per-column.
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
route_columns = [f"Route{leg}" for leg in range(1, 6)]
for route_column in route_columns:
    categorical[route_column] = le.fit_transform(categorical[route_column])
categorical.head()
# Arrival_Time_hour
# Hexagonal-bin plot of arrival hour against price
df.plot(kind="hexbin", x="Arrival_Time_hour", y="Price", gridsize=15)
plt.show()
建模數據
# Remove columns that are no longer needed.
# All columns generated so far:
categorical.columns

# Drop the raw text columns (the first three now exist in encoded form).
for raw_column in ("Airline", "Source", "Destination", "Additional_Info"):
    drop_col(categorical, raw_column)

# Final modelling frame: encoded categoricals + the three one-hot frames +
# the numeric columns, concatenated column-wise.
feature_frames = [categorical, Airline, source, destination, df[continuous_col]]
final_df = pd.concat(feature_frames, axis=1)
final_df.head()
# Outlier detection
# Visual outlier check on the final modelling data.
def plot(data, col):
    """Draw the distribution and a box plot of ``data[col]`` on two stacked axes.

    Args:
        data: DataFrame containing the column.
        col: Label of the numeric column to visualise.

    Returns:
        None. The plots are drawn on a new two-row matplotlib figure.
    """
    _, (ax1, ax2) = plt.subplots(2, 1)
    # `sns.distplot` is deprecated; this is the replacement seaborn documents
    # for a density-normalised histogram with a KDE overlay.
    sns.histplot(data[col], kde=True, stat="density", kde_kws={"cut": 3}, ax=ax1)
    # Pass the values as `x=` explicitly: the meaning of the first positional
    # argument of `boxplot` differs between seaborn versions.
    sns.boxplot(x=data[col], ax=ax2)
plot(final_df, "Price")

# Replace outliers (Price >= 40000) with the median price, then plot again
# to see the effect.
price_median = final_df["Price"].median()
is_price_outlier = final_df["Price"] >= 40000
final_df["Price"] = np.where(is_price_outlier, price_median, final_df["Price"])
plot(final_df, "Price")
# Train/test split: features are every column except Price, the target is
# Price; 20% of the rows are held out with a fixed seed.
from sklearn.model_selection import train_test_split

y = final_df["Price"]
X = final_df.drop(columns="Price")
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=123,
)
特征選擇
# Price is a continuous target, so the regression variant of mutual
# information is the appropriate estimator (the classification variant
# treats every distinct price as its own class label).
from sklearn.feature_selection import mutual_info_regression

# One mutual-information score per feature, indexed by feature name and
# shown from most to least informative.
imp = pd.DataFrame(mutual_info_regression(X, y), index=X.columns)
imp.columns = ["importance"]
imp.sort_values(by="importance", ascending=False)
評價指標
# Metrics: r2_score (the main one), mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


def predict(ml_model):
    """Fit ``ml_model`` on the training split and report hold-out metrics.

    Uses the module-level ``X_train`` / ``y_train`` / ``X_test`` / ``y_test``.
    Prints the model, its training score, the test predictions, R2, MAE, MSE
    and RMSE, then draws the distribution of the residuals.

    Args:
        ml_model: An unfitted estimator exposing ``fit``, ``score`` and
            ``predict``.

    Returns:
        None.
    """
    print("Model is: ", ml_model)
    model = ml_model.fit(X_train, y_train)
    print("Training score: ", model.score(X_train, y_train))
    predictions = model.predict(X_test)
    print("Predictions: ", predictions)
    print("----------")
    print("r2 score is: ", r2_score(y_test, predictions))
    # The original labels were "MAE:{}" etc. passed as a separate print
    # argument, so the braces were printed literally; f-strings format the
    # values properly. MSE is computed once and reused for RMSE.
    mse = mean_squared_error(y_test, predictions)
    print(f"MAE: {mean_absolute_error(y_test, predictions)}")
    print(f"MSE: {mse}")
    print(f"RMSE: {np.sqrt(mse)}")
    # Residual distribution (`sns.distplot` is deprecated; this is the
    # replacement seaborn documents).
    sns.histplot(y_test - predictions, kde=True, stat="density", kde_kws={"cut": 3})
建模
# Import the candidate models.
# Linear regression. `LogisticRegression` is a classifier and is not suited
# to the continuous Price target; its import is kept only so the name stays
# available to any code outside this section.
from sklearn.linear_model import LinearRegression, LogisticRegression
# K-nearest-neighbours regression
from sklearn.neighbors import KNeighborsRegressor
# Decision-tree regression
from sklearn.tree import DecisionTreeRegressor
# Support-vector regression
from sklearn.svm import SVR
# Gradient-boosting regression, random-forest regression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

# Fit and evaluate each model with its default hyper-parameters.
# Random-forest regression
predict(RandomForestRegressor())
# Linear regression
predict(LinearRegression())
# K-nearest-neighbours regression
predict(KNeighborsRegressor())
# Decision-tree regression
predict(DecisionTreeRegressor())
# Support-vector regression
predict(SVR())
# Gradient-boosting regression
predict(GradientBoostingRegressor())
模型調優
# Hyper-parameter tuning with a randomised search.
from sklearn.model_selection import RandomizedSearchCV

random_grid = {
    "n_estimators": [100, 120, 150, 180, 200, 220],
    # "auto" is no longer accepted by RandomForestRegressor in current
    # scikit-learn; for regressors it meant "use all features", which is
    # spelled 1.0.
    "max_features": [1.0, "sqrt"],
    "max_depth": [5, 10, 15, 20],
}
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    cv=3,
    verbose=2,
    n_jobs=-1,
)
rf_random.fit(X_train, y_train)
rf_random.best_params_

# Result after tuning: residual distribution and R2 on the hold-out split.
prediction = rf_random.predict(X_test)
# `sns.distplot` is deprecated; this is the replacement seaborn documents.
sns.histplot(y_test - prediction, kde=True, stat="density", kde_kws={"cut": 3})
r2_score(y_test, prediction)
兩種常見求解r2方式
# Toy example showing two equivalent ways of computing R2.
# Dedicated names are used so the demo does not overwrite the real `y_test`
# hold-out target defined by the train/test split.
demo_true = [1, 2, 3]
demo_pred = [1.3, 2.1, 3.5]

# Indirectly: R2 = 1 - MSE / Var(y). `np.var` defaults to the population
# variance (ddof=0), which is what this identity requires.
from sklearn.metrics import mean_squared_error
1 - mean_squared_error(demo_true, demo_pred) / np.var(demo_true)

# Directly with scikit-learn
from sklearn.metrics import r2_score
r2_score(demo_true, demo_pred)
來源:尤而小屋