在網路上看到不錯的一篇文章 , 是用R寫的,最近剛好在玩Python,所以,也利用這文章的內容,重新用Python來重寫一次,在重寫的過程中,深深有感,R對於數據分析處理比Pyrhon來的簡潔多。當然,最終目的一樣都可以做出來就是了。
一、交易資料樣本下載
https://archive.ics.uci.edu/ml/datasets/online+retail
二、程式碼
import numpy as np import pandas as pd import holoviews as hv from holoviews import opts import matplotlib.pyplot as plt from sklearn.cluster import KMeans # %% 將範列取進來 df0 = pd.read_excel(R'Online Retail.xlsx',sheet_name=0) # %% df0.head()
# %% 取出自己要的 df0 = df0[['CustomerID', 'InvoiceDate','Quantity','UnitPrice']]
# %% 是否有數量為負的 df0[(df0['Quantity'] < 0
# %% 是否有價錢為負的 df0[(df0['UnitPrice'] < 0
# %% 只需要數量跟單價都大於 0 的資料 df0.query('Quantity > 0 and UnitPrice >0', inplace=True) # %% 找出並刪除空值,因為,我們不分析空值 df0.isna().sum()
# %% 刪除 df0.dropna(inplace=True) # %% 增加總合回去 df0['amount'] = df0.Quantity*df0.UnitPrice
# %% 取出這次要用的欄位,及重新取一個名子(不更換也可以,只是為了比照sample方便) df0= df0[['CustomerID', 'InvoiceDate','amount']] df0.columns = ["cid", "date", "amount"] df0['years'] = df0['date'].dt.year # %% 看一下欄位型態 df0.dtypes
# %% 將會員ID改為字串(非數字) df0.cid = df0.cid.astype('object')
# %% 依年分查看筆數 fig,axes = plt.subplots() df0['years'].plot.hist(grid=True, bins=2, rwidth=1, color='#607c8e',edgecolor='#000000',histtype='bar',ax=axes) axes.set_title('No. Transaction by Year',fontweight ="bold", fontsize = 16) #設定直方圖的標題 axes.set_xlabel('Years') # 設定直方圖橫座標軸的標籤 axes.set_ylabel('Frequency') axes.grid(axis='x', alpha=0.80) axes.set_facecolor('#d8dcd6') plt.savefig('p-year.png') # 儲存圖片
# %% 增加一筆,每筆資料,離今天有幾天了, 因為,資料是拿sample,所以,日期會太遠,days 還是直接使用該交易資料的最大一天好了 #df0['days'] =(pd.to_datetime(datetime.date.today(),format='%Y/%m/%d') - pd.to_datetime(df0['date'],format='%Y/%m/%d')).dt.days # %% df0['days'] =(pd.to_datetime(max(df0['date']),format='%Y/%m/%d') - pd.to_datetime(df0['date'],format='%Y/%m/%d')).dt.days
# %% Df 為該會員交易日期的最後一天 a = df0.groupby("cid").agg( recent = pd.NamedAgg(column="days", aggfunc="min"), freq = pd.NamedAgg(column="date", aggfunc="nunique"), money = pd.NamedAgg(column="amount", aggfunc="mean"), senior=pd.NamedAgg(column="days", aggfunc="max"), since =pd.NamedAgg(column="date", aggfunc="min"), Df =pd.NamedAgg(column="date", aggfunc="max") ) a.reset_index(inplace=True) # %% 制作一個 DD 欄位,做為日期差幾天 a['DD'] =(pd.to_datetime(a['Df'],format='%Y/%m/%d') - pd.to_datetime(a['since'],format='%Y/%m/%d')).dt.days # %% a['BB'] = 0 # %% 計算每個人的平均購買週期 for indexs in a.index: grade_val = (a.loc[indexs, 'freq']) if grade_val > 1: a.loc[indexs, 'BB'] = a.loc[indexs, 'DD'] / (a.loc[indexs, 'freq']-1)
# %% 最近購買日 fig,axes = plt.subplots(2,2) a['recent'].hist(grid=True, bins=20, rwidth=1, color='#607c8e',edgecolor='#000000',histtype='bar',ax=axes[0,0]) axes[0,0].set_title('recent',fontweight ="bold", fontsize = 11) #設定直方圖的標題 axes[0,0].set_xlabel('days') # 設定直方圖橫座標軸的標籤 axes[0,0].set_ylabel('Member') axes[0,0].grid(axis='x') axes[0,0].set_facecolor('#d8dcd6') #plt.savefig('D:\JJ_desktop\python\p1.png') # 儲存圖片 # 將大於15次的購買次數更新為 15 統一最大值 for indexs in a.index: grade_val = (a.loc[indexs, 'freq']) if grade_val > 15: a.loc[indexs, 'freq'] = 15 # 顧客在限定時間內的購買次數。 a['freq'].hist(grid=True, bins=10, rwidth=1, color='#607c8e',edgecolor='#000000',histtype='bar',ax=axes[0,1]) axes[0,1].set_title('frequency',fontweight ="bold", fontsize = 11) #設定直方圖的標題 axes[0,1].set_xlabel('freq') # 設定直方圖橫座標軸的標籤 axes[0,1].set_ylabel('Member') axes[0,1].grid(axis='x') axes[0,1].set_facecolor('#d8dcd6') # 首次購買日,看那一陣子客人最多 a['senior'].hist(grid=True, bins=10, rwidth=1, color='#607c8e',edgecolor='#000000',histtype='bar',ax=axes[1,0]) axes[1,0].set_title('seniority',fontweight ="bold", fontsize = 11) #設定直方圖的標題 axes[1,0].set_xlabel('freq') # 設定直方圖橫座標軸的標籤 axes[1,0].set_ylabel('Member') axes[1,0].grid(axis='x') axes[1,0].set_facecolor('#d8dcd6') np.log10(a['money']).plot.hist(grid=True, bins=10, rwidth=1, color='#607c8e',edgecolor='#000000',histtype='bar',ax=axes[1,1]) axes[1,1].set_title('money(log)',fontweight ="bold", fontsize = 11) #設定直方圖的標題 axes[1,1].set_xlabel('money') # 設定直方圖橫座標軸的標籤 axes[1,1].set_ylabel('Member') axes[1,1].grid(axis='x') axes[1,1].set_facecolor('#d8dcd6') plt.subplots_adjust(left=0.125, bottom=0.1, right=0.9, top=0.9, wspace=0.6, hspace=0.6) plt.savefig(R'p1.png') # 儲存圖片
# %% kmeans = KMeans(n_clusters=10) # 指定聚類數量 kmeans.fit(a[['recent','freq','money','senior']]) # 執行聚類 result_kmeans = pd.DataFrame(kmeans.labels_, columns=['type']) # %% result = a.join(result_kmeans) # 將聚類結果與資料合併 result = result.groupby("type").agg( recent = pd.NamedAgg(column="recent", aggfunc="mean"), freq = pd.NamedAgg(column="freq", aggfunc="mean"), money = pd.NamedAgg(column="money", aggfunc="mean"), size=pd.NamedAgg(column="cid", aggfunc="nunique")) result.reset_index(inplace=True) result['freq'] = np.log10(result['freq']) result['money'] = np.log10(result['money']) X=result['freq'].to_numpy() Y=result['money'].to_numpy() S=result['size'].to_numpy() # %% annotations=result['size'].to_numpy() plt.figure(figsize=(8,6)) plt.scatter(X,Y,s=S,c=S, alpha=0.5, cmap='viridis') c = plt.colorbar() c.ax.set_title('Recency') plt.clim() plt.xlabel("Average Transaction Amount (log)") plt.ylabel("Frequency (log)") plt.suptitle("(bubble_size:revenue_contribution; text:group_size)", y=0.05, fontsize=10) plt.title("Customer Segements", fontsize=18) ax = fig.add_subplot(1, 1, 1) for i, label in enumerate(annotations): plt.text(X[i], Y[i],label) plt.savefig('p5.png') # 儲存圖片
# %% 計算會員的平均購買週期 k = np.mean(a[(a['freq'] >1)]['BB']) # %% 建立規則分群的function def get_NRS(row): rx = row['recent'] fx= row['freq'] mx= row['money'] sx = row['senior'] K=k returnVal = '' if sx < 2*K: if fx*mx > 50: returnVal = 'N2' else: returnVal = 'N1' else: if rx < 2*K: if sx/fx < 0.75*K: returnVal = 'R2' else: returnVal = 'R1' else: if rx < 3*K: returnVal = 'S1' else: if rx < 4*K: returnVal = 'S2' else: returnVal = 'S3' return returnVal
# %% 每一年的days是不同的,這裏處理每一年的 days df0[(df0['date'].dt.year <=2010)]['days'] =(pd.to_datetime('2010/1/1',format='%Y/%m/%d') - pd.to_datetime(df0['date'],format='%Y/%m/%d')).dt.days Y01 = df0[(df0['date'].dt.year <=2010)].groupby(['cid']).agg( recent = pd.NamedAgg(column="days", aggfunc="min"), freq = pd.NamedAgg(column="date", aggfunc="nunique"), money = pd.NamedAgg(column="amount", aggfunc="mean"), senior=pd.NamedAgg(column="days", aggfunc="max"), since =pd.NamedAgg(column="date", aggfunc="min"), Df =pd.NamedAgg(column="date", aggfunc="max") ) Y01.reset_index(inplace=True) Y01['recent'] =(pd.to_datetime('2010/12/31',format='%Y/%m/%d')-pd.to_datetime(Y01['Df'],format='%Y/%m/%d')).dt.days Y01['senior'] =(pd.to_datetime('2010/12/31',format='%Y/%m/%d')-pd.to_datetime(Y01['since'],format='%Y/%m/%d')).dt.days Y01['NRS'] = Y01.apply(get_NRS, axis=1) #df0['days'] =(pd.to_datetime(max(df0['date']),format='%Y/%m/%d') - pd.to_datetime(df0['date'],format='%Y/%m/%d')).dt.days Y02 = df0[(df0['date'].dt.year <=2011)].groupby(['cid']).agg( recent = pd.NamedAgg(column="days", aggfunc="min"), freq = pd.NamedAgg(column="date", aggfunc="nunique"), money = pd.NamedAgg(column="amount", aggfunc="mean"), senior=pd.NamedAgg(column="days", aggfunc="max"), since =pd.NamedAgg(column="date", aggfunc="min"), Df =pd.NamedAgg(column="date", aggfunc="max") ) Y02.reset_index(inplace=True) Y02['recent'] =(pd.to_datetime('2011/12/31',format='%Y/%m/%d')-pd.to_datetime(Y02['Df'],format='%Y/%m/%d')).dt.days Y02['senior'] =(pd.to_datetime('2011/12/31',format='%Y/%m/%d')-pd.to_datetime(Y02['since'],format='%Y/%m/%d')).dt.days Y02['NRS'] = Y02.apply(get_NRS, axis=1)
# %% left = np.array([1, 2]) N1 = np.array([Y01[Y01['NRS'] =='N1']["cid"].count(),Y02[Y02['NRS'] =='N1']["cid"].count()]) N2 = np.array([Y01[Y01['NRS'] =='N2']["cid"].count(),Y02[Y02['NRS'] =='N2']["cid"].count()]) R1 = np.array([Y01[Y01['NRS'] =='R1']["cid"].count(),Y02[Y02['NRS'] =='R1']["cid"].count()]) R2 = np.array([Y01[Y01['NRS'] =='R2']["cid"].count(),Y02[Y02['NRS'] =='R2']["cid"].count()]) S1 = np.array([Y01[Y01['NRS'] =='S1']["cid"].count(),Y02[Y02['NRS'] =='S1']["cid"].count()]) S2 = np.array([Y01[Y01['NRS'] =='S2']["cid"].count(),Y02[Y02['NRS'] =='S2']["cid"].count()]) S3 = np.array([Y01[Y01['NRS'] =='S3']["cid"].count(),Y02[Y02['NRS'] =='S3']["cid"].count()]) df = pd.DataFrame( {'Year':['2010', '2011'],'N1': N1, 'N2': N2, 'R1': R1, 'R2': R2, 'S1': S1, 'S2': S2, 'S3': S3} ) # %% ax = df.plot.bar(x="Year",stacked=True) ax.set_ylabel('Members') ax.set_xlabel('Year') plt.tight_layout() plt.savefig('p6.png') # 儲存圖片#N1 新客 #N2 新潛力顧客 #R1 主力客 #R2 核心顧客 #S1 磕睡客 #S2 半睡顧客 #S3 沉睡客戶
我們可以看出來,每一年會員的變化,如果,有更多年,我們可以看出,會員每年的變化,更進一步做出會員流動模型。運用sankey,看會員流動模很有趣,且,也可以即時運用各時間切法,知道,原來動要的客人,什麼時候變心了。
因為,是參考別人的文章寫出來的,很多圖表解讀就沒有寫進來,大家可以去出處看囉。
三、參考資料