在網路上看到不錯的一篇文章 , 是用R寫的,最近剛好在玩Python,所以,也利用這文章的內容,重新用Python來重寫一次,在重寫的過程中,深深有感,R對於數據分析處理比Pyrhon來的簡潔多。當然,最終目的一樣都可以做出來就是了。
一、交易資料樣本下載
https://archive.ics.uci.edu/ml/datasets/online+retail
二、程式碼
import numpy as np
import pandas as pd
import holoviews as hv
from holoviews import opts
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# %% 將範列取進來
df0 = pd.read_excel(R'Online Retail.xlsx',sheet_name=0)
# %%
df0.head()
# %% 取出自己要的
df0 = df0[['CustomerID', 'InvoiceDate','Quantity','UnitPrice']]
# %% 是否有數量為負的
df0[(df0['Quantity'] < 0
# %% 是否有價錢為負的
df0[(df0['UnitPrice'] < 0
# %% 只需要數量跟單價都大於 0 的資料
df0.query('Quantity > 0 and UnitPrice >0', inplace=True)
# %% 找出並刪除空值,因為,我們不分析空值
df0.isna().sum()
# %% 刪除
df0.dropna(inplace=True)
# %% 增加總合回去
df0['amount'] = df0.Quantity*df0.UnitPrice
# %% 取出這次要用的欄位,及重新取一個名子(不更換也可以,只是為了比照sample方便)
df0= df0[['CustomerID', 'InvoiceDate','amount']]
df0.columns = ["cid", "date", "amount"]
df0['years'] = df0['date'].dt.year
# %% 看一下欄位型態
df0.dtypes
# %% 將會員ID改為字串(非數字)
df0.cid = df0.cid.astype('object')
# %% 依年分查看筆數
fig,axes = plt.subplots()
df0['years'].plot.hist(grid=True, bins=2, rwidth=1,
color='#607c8e',edgecolor='#000000',histtype='bar',ax=axes)
axes.set_title('No. Transaction by Year',fontweight ="bold", fontsize = 16) #設定直方圖的標題
axes.set_xlabel('Years') # 設定直方圖橫座標軸的標籤
axes.set_ylabel('Frequency')
axes.grid(axis='x', alpha=0.80)
axes.set_facecolor('#d8dcd6')
plt.savefig('p-year.png') # 儲存圖片
# %% 增加一筆,每筆資料,離今天有幾天了, 因為,資料是拿sample,所以,日期會太遠,days 還是直接使用該交易資料的最大一天好了
#df0['days'] =(pd.to_datetime(datetime.date.today(),format='%Y/%m/%d') - pd.to_datetime(df0['date'],format='%Y/%m/%d')).dt.days
# %%
df0['days'] =(pd.to_datetime(max(df0['date']),format='%Y/%m/%d') - pd.to_datetime(df0['date'],format='%Y/%m/%d')).dt.days
# %% Df 為該會員交易日期的最後一天
a = df0.groupby("cid").agg(
recent = pd.NamedAgg(column="days", aggfunc="min"),
freq = pd.NamedAgg(column="date", aggfunc="nunique"),
money = pd.NamedAgg(column="amount", aggfunc="mean"),
senior=pd.NamedAgg(column="days", aggfunc="max"),
since =pd.NamedAgg(column="date", aggfunc="min"),
Df =pd.NamedAgg(column="date", aggfunc="max")
)
a.reset_index(inplace=True)
# %% 制作一個 DD 欄位,做為日期差幾天
a['DD'] =(pd.to_datetime(a['Df'],format='%Y/%m/%d') - pd.to_datetime(a['since'],format='%Y/%m/%d')).dt.days
# %%
a['BB'] = 0
# %% 計算每個人的平均購買週期
for indexs in a.index:
grade_val = (a.loc[indexs, 'freq'])
if grade_val > 1:
a.loc[indexs, 'BB'] = a.loc[indexs, 'DD'] / (a.loc[indexs, 'freq']-1)
# %% 最近購買日
fig,axes = plt.subplots(2,2)
a['recent'].hist(grid=True, bins=20, rwidth=1,
color='#607c8e',edgecolor='#000000',histtype='bar',ax=axes[0,0])
axes[0,0].set_title('recent',fontweight ="bold", fontsize = 11) #設定直方圖的標題
axes[0,0].set_xlabel('days') # 設定直方圖橫座標軸的標籤
axes[0,0].set_ylabel('Member')
axes[0,0].grid(axis='x')
axes[0,0].set_facecolor('#d8dcd6')
#plt.savefig('D:\JJ_desktop\python\p1.png') # 儲存圖片
# 將大於15次的購買次數更新為 15 統一最大值
for indexs in a.index:
grade_val = (a.loc[indexs, 'freq'])
if grade_val > 15:
a.loc[indexs, 'freq'] = 15
# 顧客在限定時間內的購買次數。
a['freq'].hist(grid=True, bins=10, rwidth=1,
color='#607c8e',edgecolor='#000000',histtype='bar',ax=axes[0,1])
axes[0,1].set_title('frequency',fontweight ="bold", fontsize = 11) #設定直方圖的標題
axes[0,1].set_xlabel('freq') # 設定直方圖橫座標軸的標籤
axes[0,1].set_ylabel('Member')
axes[0,1].grid(axis='x')
axes[0,1].set_facecolor('#d8dcd6')
# 首次購買日,看那一陣子客人最多
a['senior'].hist(grid=True, bins=10, rwidth=1,
color='#607c8e',edgecolor='#000000',histtype='bar',ax=axes[1,0])
axes[1,0].set_title('seniority',fontweight ="bold", fontsize = 11) #設定直方圖的標題
axes[1,0].set_xlabel('freq') # 設定直方圖橫座標軸的標籤
axes[1,0].set_ylabel('Member')
axes[1,0].grid(axis='x')
axes[1,0].set_facecolor('#d8dcd6')
np.log10(a['money']).plot.hist(grid=True, bins=10, rwidth=1,
color='#607c8e',edgecolor='#000000',histtype='bar',ax=axes[1,1])
axes[1,1].set_title('money(log)',fontweight ="bold", fontsize = 11) #設定直方圖的標題
axes[1,1].set_xlabel('money') # 設定直方圖橫座標軸的標籤
axes[1,1].set_ylabel('Member')
axes[1,1].grid(axis='x')
axes[1,1].set_facecolor('#d8dcd6')
plt.subplots_adjust(left=0.125,
bottom=0.1,
right=0.9,
top=0.9,
wspace=0.6,
hspace=0.6)
plt.savefig(R'p1.png') # 儲存圖片
# %%
kmeans = KMeans(n_clusters=10) # 指定聚類數量
kmeans.fit(a[['recent','freq','money','senior']]) # 執行聚類
result_kmeans = pd.DataFrame(kmeans.labels_, columns=['type'])
# %%
result = a.join(result_kmeans) # 將聚類結果與資料合併
result = result.groupby("type").agg(
recent = pd.NamedAgg(column="recent", aggfunc="mean"),
freq = pd.NamedAgg(column="freq", aggfunc="mean"),
money = pd.NamedAgg(column="money", aggfunc="mean"),
size=pd.NamedAgg(column="cid", aggfunc="nunique"))
result.reset_index(inplace=True)
result['freq'] = np.log10(result['freq'])
result['money'] = np.log10(result['money'])
X=result['freq'].to_numpy()
Y=result['money'].to_numpy()
S=result['size'].to_numpy()
# %%
annotations=result['size'].to_numpy()
plt.figure(figsize=(8,6))
plt.scatter(X,Y,s=S,c=S, alpha=0.5, cmap='viridis')
c = plt.colorbar()
c.ax.set_title('Recency')
plt.clim()
plt.xlabel("Average Transaction Amount (log)")
plt.ylabel("Frequency (log)")
plt.suptitle("(bubble_size:revenue_contribution; text:group_size)", y=0.05, fontsize=10)
plt.title("Customer Segements", fontsize=18)
ax = fig.add_subplot(1, 1, 1)
for i, label in enumerate(annotations):
plt.text(X[i], Y[i],label)
plt.savefig('p5.png') # 儲存圖片
# %% 計算會員的平均購買週期
k = np.mean(a[(a['freq'] >1)]['BB'])
# %% 建立規則分群的function
def get_NRS(row):
rx = row['recent']
fx= row['freq']
mx= row['money']
sx = row['senior']
K=k
returnVal = ''
if sx < 2*K:
if fx*mx > 50:
returnVal = 'N2'
else:
returnVal = 'N1'
else:
if rx < 2*K:
if sx/fx < 0.75*K:
returnVal = 'R2'
else:
returnVal = 'R1'
else:
if rx < 3*K:
returnVal = 'S1'
else:
if rx < 4*K:
returnVal = 'S2'
else:
returnVal = 'S3'
return returnVal
# %% 每一年的days是不同的,這裏處理每一年的 days
df0[(df0['date'].dt.year <=2010)]['days'] =(pd.to_datetime('2010/1/1',format='%Y/%m/%d') - pd.to_datetime(df0['date'],format='%Y/%m/%d')).dt.days
Y01 = df0[(df0['date'].dt.year <=2010)].groupby(['cid']).agg(
recent = pd.NamedAgg(column="days", aggfunc="min"),
freq = pd.NamedAgg(column="date", aggfunc="nunique"),
money = pd.NamedAgg(column="amount", aggfunc="mean"),
senior=pd.NamedAgg(column="days", aggfunc="max"),
since =pd.NamedAgg(column="date", aggfunc="min"),
Df =pd.NamedAgg(column="date", aggfunc="max")
)
Y01.reset_index(inplace=True)
Y01['recent'] =(pd.to_datetime('2010/12/31',format='%Y/%m/%d')-pd.to_datetime(Y01['Df'],format='%Y/%m/%d')).dt.days
Y01['senior'] =(pd.to_datetime('2010/12/31',format='%Y/%m/%d')-pd.to_datetime(Y01['since'],format='%Y/%m/%d')).dt.days
Y01['NRS'] = Y01.apply(get_NRS, axis=1)
#df0['days'] =(pd.to_datetime(max(df0['date']),format='%Y/%m/%d') - pd.to_datetime(df0['date'],format='%Y/%m/%d')).dt.days
Y02 = df0[(df0['date'].dt.year <=2011)].groupby(['cid']).agg(
recent = pd.NamedAgg(column="days", aggfunc="min"),
freq = pd.NamedAgg(column="date", aggfunc="nunique"),
money = pd.NamedAgg(column="amount", aggfunc="mean"),
senior=pd.NamedAgg(column="days", aggfunc="max"),
since =pd.NamedAgg(column="date", aggfunc="min"),
Df =pd.NamedAgg(column="date", aggfunc="max")
)
Y02.reset_index(inplace=True)
Y02['recent'] =(pd.to_datetime('2011/12/31',format='%Y/%m/%d')-pd.to_datetime(Y02['Df'],format='%Y/%m/%d')).dt.days
Y02['senior'] =(pd.to_datetime('2011/12/31',format='%Y/%m/%d')-pd.to_datetime(Y02['since'],format='%Y/%m/%d')).dt.days
Y02['NRS'] = Y02.apply(get_NRS, axis=1)
# %%
left = np.array([1, 2])
N1 = np.array([Y01[Y01['NRS'] =='N1']["cid"].count(),Y02[Y02['NRS'] =='N1']["cid"].count()])
N2 = np.array([Y01[Y01['NRS'] =='N2']["cid"].count(),Y02[Y02['NRS'] =='N2']["cid"].count()])
R1 = np.array([Y01[Y01['NRS'] =='R1']["cid"].count(),Y02[Y02['NRS'] =='R1']["cid"].count()])
R2 = np.array([Y01[Y01['NRS'] =='R2']["cid"].count(),Y02[Y02['NRS'] =='R2']["cid"].count()])
S1 = np.array([Y01[Y01['NRS'] =='S1']["cid"].count(),Y02[Y02['NRS'] =='S1']["cid"].count()])
S2 = np.array([Y01[Y01['NRS'] =='S2']["cid"].count(),Y02[Y02['NRS'] =='S2']["cid"].count()])
S3 = np.array([Y01[Y01['NRS'] =='S3']["cid"].count(),Y02[Y02['NRS'] =='S3']["cid"].count()])
df = pd.DataFrame( {'Year':['2010', '2011'],'N1': N1, 'N2': N2, 'R1': R1, 'R2': R2, 'S1': S1, 'S2': S2, 'S3': S3} )
# %%
ax = df.plot.bar(x="Year",stacked=True)
ax.set_ylabel('Members')
ax.set_xlabel('Year')
plt.tight_layout()
plt.savefig('p6.png') # 儲存圖片
#N1 新客
#N2 新潛力顧客
#R1 主力客
#R2 核心顧客
#S1 磕睡客
#S2 半睡顧客
#S3 沉睡客戶
我們可以看出來,每一年會員的變化,如果,有更多年,我們可以看出,會員每年的變化,更進一步做出會員流動模型。運用sankey,看會員流動模很有趣,且,也可以即時運用各時間切法,知道,原來動要的客人,什麼時候變心了。
因為,是參考別人的文章寫出來的,很多圖表解讀就沒有寫進來,大家可以去出處看囉。
三、參考資料
2、 顧客價值分析(Buy Till You Die) with Python
該文,是用Python寫的,是可以直接試哦,1 的後半段,我是用這文中的方式去做的。