import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.svm import SVR
from sklearn.inspection import permutation_importance
import xgboost as xg
import shap
import warnings

warnings.filterwarnings("ignore")

# Read the joined data
df = pd.read_csv("PosMacroSegmentTradeWalkSwitch.csv")

# Feature engineering - flag clusters that contain at least one private label SKU.
# This runs before the retailer/manufacturer filter so the flag reflects the whole
# market rather than only the filtered rows.
df['similar private label SKU'] = 0
for i in df['cluster'].unique():
    in_cluster = df['cluster'] == i
    has_private_label = (in_cluster & (df['Manufacturer.full'] == 'PRIVATE LABEL')).any()
    df.loc[in_cluster, 'similar private label SKU'] = int(has_private_label)

# Filtering for Asda Kellogg's
df = df[(df['geoCodeDesc'] == 'Asda') & (df['BrnDedPrivLab.full'] == 'KELLOGGS')]
df['Investment/valueSales'] = df['Total Trade Investment'].astype('float') / df['valueSales']

# Selecting the features relevant for modelling
df = df[['prodDesc', 'unitSales', 'valueSales', 'unitPrice', 'target customers',
         'SubSegment.full', 'majority customers', 'valueSales/SubSeg', 'unitSales/SubSeg',
         'Total Trade Investment', 'Investment/valueSales', 'WalkAway', 'KSwitchIndex',
         'CompSwitchIndex', 'cluster', 'valueSales/ACV', 'Manufacturer.full',
         'similar private label SKU', 'dateEnding_year', 'geoCodeDesc']]

# Identifying product trends between 2021 and 2022
df['p1'] = np.where(df['dateEnding_year'] == 2021, df['valueSales'], 0)
df['p2'] = np.where(df['dateEnding_year'] == 2022, df['valueSales'], 0)
p1 = df.groupby('prodDesc').agg({'p1': 'sum', 'p2': 'sum'}).reset_index()

# Calculating the discount flag from the 5-week rolling 90th-percentile price.
# The first four windows of each product are NaN, so they are back-filled with
# the first complete window's value; products with fewer than five weeks of data
# fall back to their first observed price. transform() keeps df's index, so the
# result stays aligned with the filtered frame.
df['90PUnitPrice'] = (
    df.groupby(['prodDesc', 'geoCodeDesc'])['unitPrice']
      .transform(lambda s: s.rolling(5).quantile(0.9).bfill().fillna(s.iloc[0]))
)
df['discount'] = np.where(df['90PUnitPrice'] - df['unitPrice'] > 0, 1, 0)
df['noDiscountSales'] = np.where(df['discount'] == 0, df['valueSales'], 0)

# Standardizing numeric columns and factorizing categorical ones
new_df = df.drop(columns=['prodDesc', 'SubSegment.full', 'majority customers',
                          'unitSales/SubSeg', 'dateEnding_year', 'p1', 'p2',
                          '90PUnitPrice', 'Manufacturer.full'])
new_df.replace([np.inf, -np.inf], 0, inplace=True)
new_df['Total Trade Investment'] = new_df['Total Trade Investment'].astype('float')
numeric = new_df.select_dtypes(include='number')
standardized_new_df = (numeric - numeric.mean()) / numeric.std()
factorized_new_df = new_df.select_dtypes(exclude='number').apply(lambda x: pd.factorize(x)[0])
new_df = pd.concat([standardized_new_df, factorized_new_df], axis=1)
new_df = new_df.drop(columns=['Total Trade Investment', 'geoCodeDesc'])

# Model Training
# Model 1 - SVR inclusive of all features (fitted and scored on the full data)
y = new_df['valueSales']
X = new_df.loc[:, new_df.columns != 'valueSales']
X = X.fillna(0)
svr = SVR(kernel='rbf')
svr.fit(X, y)
pred = svr.predict(X)
rmse = np.sqrt(MSE(y, pred))
print('Model 1 - SVR inclusive of all features')
print('RMSE:', rmse)
print('MAE:', mean_absolute_error(y, pred))
print('MAE%:', mean_absolute_percentage_error(y, pred))
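# --- Illustration (not part of the original pipeline) ------------------------
# The metrics above are computed on the same rows the model was fitted on, so
# they measure fit rather than generalization. A minimal holdout sketch,
# assuming the X/y defined above; the split parameters are arbitrary choices.
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=123)
svr_holdout = SVR(kernel='rbf').fit(X_tr, y_tr)
print('Holdout RMSE:', np.sqrt(MSE(y_te, svr_holdout.predict(X_te))))
# ------------------------------------------------------------------------------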
fdf = pd.DataFrame(X.columns.tolist(), columns=['Features'])
perm_importance = permutation_importance(svr, X, y)
sorted_idx = perm_importance.importances_mean.argsort()
a = pd.DataFrame(X.columns[sorted_idx], columns=['Features'])
a['SVR Permutation Score'] = perm_importance.importances_mean[sorted_idx]
fdf = fdf.merge(a, how='inner', on='Features')
rdf = pd.DataFrame()
rdf['SVR Sales'] = pred

# Model 2 - XGBoost inclusive of all features
y = new_df['valueSales']
X = new_df.loc[:, new_df.columns != 'valueSales']
xgb_r = xg.XGBRegressor(objective='reg:squarederror', n_estimators=200, random_state=123)
xgb_r.fit(X, y)
pred = xgb_r.predict(X)
rmse = np.sqrt(MSE(y, pred))
print('Model 2 - XGBoost inclusive of all features')
print('RMSE:', rmse)
print('MAE:', mean_absolute_error(y, pred))
print('MAE%:', mean_absolute_percentage_error(y, pred))
fdf['XGBoost Feature Importance'] = xgb_r.feature_importances_

# Mean absolute SHAP value per feature, sorted from most to least influential
explainer = shap.TreeExplainer(xgb_r)
shap_values = explainer.shap_values(X)
mean_abs_shap = np.abs(shap_values).mean(axis=0)
order = np.argsort(mean_abs_shap)[::-1]
sdf = pd.DataFrame({'Features': X.columns[order], 'XGBoost SHAP': mean_abs_shap[order]})
fdf = fdf.merge(sdf, how='inner', on='Features')
rdf['XGBoost Sales'] = pred

# Model 3 - SVR without unit sales and value sales/ACV
y = new_df['valueSales']
X = new_df.loc[:, new_df.columns != 'valueSales']
X = X.drop(columns=['unitSales', 'valueSales/ACV'])
X = X.fillna(0)
svr = SVR(kernel='rbf')
svr.fit(X, y)
pred = svr.predict(X)
rmse = np.sqrt(MSE(y, pred))
print('Model 3 - SVR without unit sales and value sales/ACV')
print('RMSE:', rmse)
print('MAE:', mean_absolute_error(y, pred))
print('MAE%:', mean_absolute_percentage_error(y, pred))
perm_importance = permutation_importance(svr, X, y)
sorted_idx = perm_importance.importances_mean.argsort()
a = pd.DataFrame(X.columns[sorted_idx], columns=['Features'])
a['SVRD Permutation Score'] = perm_importance.importances_mean[sorted_idx]
fdf = fdf.merge(a, how='left', on='Features')
rdf['SVRD Sales'] = pred

# Model 4 - XGBoost without unit sales and value sales/ACV
y = new_df['valueSales']
X = new_df.loc[:, new_df.columns != 'valueSales']
X = X.drop(columns=['unitSales', 'valueSales/ACV'])
xgb_r = xg.XGBRegressor(objective='reg:squarederror', n_estimators=200, random_state=123)
xgb_r.fit(X, y)
pred = xgb_r.predict(X)
rmse = np.sqrt(MSE(y, pred))
print('Model 4 - XGBoost without unit sales and value sales/ACV')
print('RMSE:', rmse)
print('MAE:', mean_absolute_error(y, pred))
print('MAE%:', mean_absolute_percentage_error(y, pred))
sorted_idx = xgb_r.feature_importances_.argsort()
a = pd.DataFrame(X.columns[sorted_idx], columns=['Features'])
a['XGBoostD Feature Importance'] = xgb_r.feature_importances_[sorted_idx]
fdf = fdf.merge(a, how='left', on='Features')
explainer = shap.TreeExplainer(xgb_r)
shap_values = explainer.shap_values(X)
mean_abs_shap = np.abs(shap_values).mean(axis=0)
order = np.argsort(mean_abs_shap)[::-1]
sdf = pd.DataFrame({'Features': X.columns[order], 'XGBoostD SHAP': mean_abs_shap[order]})
fdf = fdf.merge(sdf, how='left', on='Features')
rdf['XGBoostD Sales'] = pred

# Reversing the standardization of sales (x = z * std + mean, using the same
# statistics the z-scores were computed with)
sales_std, sales_mean = df['valueSales'].std(), df['valueSales'].mean()
rdf['SVR reversed'] = rdf['SVR Sales'] * sales_std + sales_mean
rdf['XGBoost reversed'] = rdf['XGBoost Sales'] * sales_std + sales_mean
rdf['SVRD reversed'] = rdf['SVRD Sales'] * sales_std + sales_mean
rdf['XGBoostD reversed'] = rdf['XGBoostD Sales'] * sales_std + sales_mean
rdf['Original Sales'] = df['valueSales'].tolist()
rdf['prodDesc'] = df['prodDesc'].tolist()
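# --- Illustration (not part of the original pipeline) ------------------------
# Sanity check of the inverse transform used above: standardizing a toy series
# and mapping it back with x = z * std + mean recovers the original values.
toy_sales = pd.Series([10.0, 20.0, 40.0])
z = (toy_sales - toy_sales.mean()) / toy_sales.std()
assert np.allclose(z * toy_sales.std() + toy_sales.mean(), toy_sales)
# ------------------------------------------------------------------------------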
# Aggregating rdf by product description to rank each SKU on predicted sales
rdf = rdf.groupby('prodDesc').agg({'SVR reversed': 'sum', 'Original Sales': 'sum',
                                   'XGBoost reversed': 'sum', 'XGBoostD reversed': 'sum',
                                   'SVRD reversed': 'sum'}).reset_index()
rdf['original rank'] = rdf['Original Sales'].rank(ascending=False)
rdf['prediction SVR rank'] = rdf['SVR reversed'].rank(ascending=False)
rdf['prediction SVRD rank'] = rdf['SVRD reversed'].rank(ascending=False)
rdf['prediction XGBoostD rank'] = rdf['XGBoostD reversed'].rank(ascending=False)
rdf['prediction XGBoost rank'] = rdf['XGBoost reversed'].rank(ascending=False)
print(rdf.sort_values(by='Original Sales').head(20))

# To understand the goodness of fit, plot the SVR residuals against a fitted normal curve
residuals = rdf['Original Sales'] - rdf['SVR reversed']
mu, std = norm.fit(residuals)
plt.hist(residuals, density=True)
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
plt.plot(x, norm.pdf(x, mu, std), 'k', linewidth=2)
plt.xlabel("Original Sales - Predicted Sales (Residuals)")
plt.show()

# Saving the ranking and feature importance
rdf.to_csv("SKU Ranking.csv")
fdf.to_csv("Feature Importance.csv")

# Aggregating the original data to find the product mix
df['Total Trade Investment'] = df['Total Trade Investment'].astype('float')
sub_df = df.groupby('prodDesc').agg({'valueSales': 'sum', 'unitSales': 'sum',
                                     'unitPrice': 'mean', 'target customers': 'mean',
                                     'SubSegment.full': pd.Series.mode,
                                     'valueSales/SubSeg': pd.Series.mode,
                                     'Investment/valueSales': 'sum', 'WalkAway': 'mean',
                                     'KSwitchIndex': pd.Series.mode,
                                     'CompSwitchIndex': pd.Series.mode,
                                     'cluster': 'mean', 'valueSales/ACV': 'mean',
                                     'similar private label SKU': pd.Series.mode,
                                     'discount': 'mean', 'noDiscountSales': 'sum'}).reset_index()
# Year-over-year change expressed as a share of 2022 sales (sub_df and p1 are both
# grouped and sorted by prodDesc, so positional alignment is safe here)
sub_df['trend'] = (p1['p2'] - p1['p1']) / p1['p2']
sub_df['% Non Discounted Sales'] = sub_df['noDiscountSales'] / sub_df['valueSales']
sub_df = sub_df.sort_values(by='valueSales', ascending=False)
sub_df['cumulative valueSales'] = sub_df['valueSales'].cumsum()
sub_df['% sales'] = sub_df['cumulative valueSales'] / sub_df['valueSales'].sum()
sub_df['prodSales within SubSeg (%)'] = (sub_df['valueSales'] * 100) / sub_df['valueSales/SubSeg']
sub_df = sub_df.sort_values(by='valueSales')
sub_df.to_csv("Product Mix Data.csv")

# Standardizing the product mix for better business analysis
numeric = sub_df.select_dtypes(include='number')
standardized_sub_df = (numeric - numeric.mean()) / numeric.std()
non_numeric_sub_df = sub_df.select_dtypes(exclude='number')
sub_df = pd.concat([standardized_sub_df, non_numeric_sub_df], axis=1)
# sub_df is sorted by ascending valueSales, so rank 1 goes to the top seller
sub_df['valueSales Rank'] = list(range(len(sub_df), 0, -1))
sub_df.to_csv("Product Mix Data Standardized.csv")
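# --- Illustration (not part of the original pipeline) ------------------------
# A minimal sketch of the cumulative-share ("% sales") calculation above, on a
# hypothetical four-SKU frame: after sorting by sales, the running share shows
# how few SKUs cover the bulk of revenue (the usual Pareto read).
toy_mix = pd.DataFrame({'prodDesc': ['A', 'B', 'C', 'D'],
                        'valueSales': [500.0, 300.0, 150.0, 50.0]})
toy_mix = toy_mix.sort_values(by='valueSales', ascending=False)
toy_mix['% sales'] = toy_mix['valueSales'].cumsum() / toy_mix['valueSales'].sum()
print(toy_mix)  # A and B alone already account for 80% of sales
# ------------------------------------------------------------------------------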