# Imports
import datetime

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the POS data
pos_data = pd.read_csv('1900uUKCereals2023.csv')

# Change dateEnding column format
pos_data['dateEnding'] = pd.to_datetime(pos_data['dateEnding'], format='%Y%m%d')

# Period patterns for the 4-4-5 retail calendar
week_patterns = [4, 4, 5]
periods = []
pattern_index = 0
start_date = datetime.datetime.strptime('01-01', '%m-%d')

# Generate the period dictionary
for i in range(12):
    weeks = week_patterns[pattern_index % len(week_patterns)]
    # The last period absorbs the extra day so the calendar covers the full year
    end_date = (start_date + datetime.timedelta(weeks=weeks) - datetime.timedelta(days=1)
                if pattern_index < 11
                else start_date + datetime.timedelta(weeks=weeks))

    # Start and end dates into the dictionary
    start_end_period = {
        'start_date': start_date.strftime('%m-%d'),
        'end_date': end_date.strftime('%m-%d')
    }
    periods.append(start_end_period)

    # Update the start date to the day after the end date for the next period
    start_date = end_date + datetime.timedelta(days=1)

    # Increment to follow the cyclic 4-4-5 pattern
    pattern_index += 1

# Function to classify a date into a period
def classify_dates_to_periods(date):
    date = date.strftime('%m-%d')
    for i, period_dates in enumerate(periods):
        start_date = period_dates['start_date']
        end_date = period_dates['end_date']
        if start_date <= date <= end_date:
            return f'P{i+1}'

# Calculate the period for POS data
pos_data['period'] = pos_data['dateEnding'].apply(classify_dates_to_periods)
pos_data['month_year'] = pos_data['dateEnding'].dt.to_period('M')
pos_data['Size.full'] = pos_data['Size.full'].str.replace("GR", "").astype('float')
pos_data['dateEnding_year'] = pos_data['dateEnding'].dt.year
pos_data['dateEnding_month'] = pos_data['dateEnding'].dt.month
pos_data['dateEnding_week'] = pos_data['dateEnding'].dt.isocalendar().week
pos_data['dateEnding_year_special'] = pos_data['dateEnding_year']

# Handle the special case where a date at the beginning of January belongs to the last ISO week
# of the previous year, or a date at the end of December belongs to week 1 of the next year
for index, row in pos_data.iterrows():
    if row['dateEnding_month'] == 1 and row['dateEnding_week'] == 53:
        pos_data.at[index, 'dateEnding_year_special'] = row['dateEnding_year'] - 1
    if row['dateEnding_month'] == 12 and row['dateEnding_week'] == 1:
        pos_data.at[index, 'dateEnding_year_special'] = row['dateEnding_year'] + 1

pos_data['Product_ID'] = pos_data['prodCodeGeo'].str.split(' ').str[0]

# Create the unique ID to be able to match to the segments
pos_data['Unique_ID'] = (pos_data['Product_ID'].astype(str) + '_'
                         + pos_data['dateEnding_year_special'].astype(str) + '_'
                         + pos_data['dateEnding_week'].astype(str))

# Sub-segment level sales totals, merged back onto every row
segments = pos_data.groupby('SubSegment.full').agg({'valueSales': 'sum', 'unitSales': 'sum'}).reset_index()
segments.columns = ['SubSegment.full', 'valueSales/SubSeg', 'unitSales/SubSeg']
pos_data = pos_data.merge(segments, how='inner', on='SubSegment.full')

# Value sales per point of weighted distribution
pos_data['valueSales/ACV'] = pos_data['valueSales'] / pos_data['ACVWdtDist']
pos_data['valueSales/ACV'] = pos_data['valueSales/ACV'].replace([np.inf, -np.inf], 0)

# Flag discounted rows: a unit price below the product's 90th-percentile price counts as a discount
temp = pos_data.groupby('prodDesc').agg({'unitPrice': lambda x: x.quantile(0.9)}).reset_index()
temp = temp.rename(columns={'unitPrice': '90PUnitPrice'})
pos_data = pos_data.merge(temp, how='inner', on='prodDesc')
pos_data['discount'] = np.where(pos_data['90PUnitPrice'] - pos_data['unitPrice'] > 0, 1, 0)
pos_data['noDiscountSales'] = np.where(pos_data['discount'] == 0, pos_data['valueSales'], 0)

pos_data.to_csv('Cleaned POS.csv', index=False)
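# Optional illustrative check (not part of the original pipeline): the 4-4-5 calendar built
# above maps week-ending dates into periods P1-P12. The sample dates below are hypothetical;
# 2023-01-15 should fall in P1 and 2023-12-24 in P12.
sample_dates = pd.to_datetime(['2023-01-15', '2023-06-30', '2023-12-24'])
for d in sample_dates:
    print(d.date(), '->', classify_dates_to_periods(d))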
# Read Macroeconomic Data
macroeconomic_data = pd.read_excel('uk_macro_economic.xlsx')
macroeconomic_data['Time'] = pd.to_datetime(macroeconomic_data['Time'], format='%Y-%m-%d')
macroeconomic_data['month_year'] = macroeconomic_data['Time'].dt.to_period('M')
macroeconomic_data.to_csv('Cleaned Macroeconomic.csv', index=False)

# Read Segment Data
segment_data = pd.read_csv('cereal_segment_UPC_level.csv', encoding='latin1')

# Remove null values and the irrelevant unnamed column in the segment data
segment_data = segment_data.dropna()
segment_data = segment_data.drop(segment_data.columns[0], axis=1)

segment_data['Promotional_week'] = pd.to_datetime(segment_data['Promotional_week'])
segment_data['Year'] = segment_data['Promotional_week'].dt.year
segment_data['Month'] = segment_data['Promotional_week'].dt.month
segment_data['Week'] = segment_data['Promotional_week'].dt.isocalendar().week
segment_data['Year_special'] = segment_data['Year']

# Same ISO-week year adjustment as for the POS data
for index, row in segment_data.iterrows():
    if row['Month'] == 1 and row['Week'] == 53:
        segment_data.at[index, 'Year_special'] = row['Year'] - 1
    if row['Month'] == 12 and row['Week'] == 1:
        segment_data.at[index, 'Year_special'] = row['Year'] + 1

# Read Quantium data for mapping purposes
quantium_data = pd.read_csv('Quantium vs. IRI Mapping.csv')

# Map segment data to the Quantium IDs
segment_data = pd.merge(segment_data, quantium_data[['prod_num', 'Quantium_ID']],
                        left_on='Primary_EAN', right_on='Quantium_ID')

# Create a unique ID in the same form as in the POS data
segment_data['Unique_ID'] = (segment_data['prod_num'].astype(str) + '_'
                             + segment_data['Year_special'].astype(str) + '_'
                             + segment_data['Week'].astype(str))

# Aggregate the new segment data based on the unique ID
segment_data_pivot = segment_data.pivot_table(index='Unique_ID', columns='Price_Sensitivity',
                                              values='SUM(Customers)', aggfunc='sum')
segment_data_pivot = segment_data_pivot.reset_index()
segment_data_pivot.columns.name = None
segment_data_pivot.columns = [f"{col} customers" if col != 'Unique_ID' else col
                              for col in segment_data_pivot.columns]

# Share of price sensitive and mid market shoppers among all customers
segment_data_pivot['majority customers'] = segment_data_pivot['Price sensitive customers'] + segment_data_pivot['Mid market customers']
segment_data_pivot['minority customers'] = segment_data_pivot['Ultra price sensitive customers'] + segment_data_pivot['Upmarket customers']
segment_data_pivot['target customers'] = segment_data_pivot['majority customers'] / (segment_data_pivot['majority customers'] + segment_data_pivot['minority customers'])

segment_data_pivot.to_csv('Cleaned Segment.csv', index=False)
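# Illustrative example only (hypothetical counts, not from the data): the 'target customers'
# column computed above is simply the share of Price sensitive + Mid market shoppers.
example = pd.DataFrame({
    'Price sensitive customers': [600],
    'Mid market customers': [250],
    'Ultra price sensitive customers': [100],
    'Upmarket customers': [50],
})
example_majority = example['Price sensitive customers'] + example['Mid market customers']
example_minority = example['Ultra price sensitive customers'] + example['Upmarket customers']
print((example_majority / (example_majority + example_minority)).iloc[0])  # 850 / 1000 = 0.85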
# Read IRI data and convert from Excel to csv
iri_mapping = pd.read_excel('trade_xref.xlsx', sheet_name='kellogg_iri')
iri_mapping = iri_mapping.drop_duplicates(subset='prod_desc', keep='first')
iri_mapping.to_csv('Cleaned Mapping.csv', index=False)

# Read Trade Data
trade = pd.read_csv('ASDATrade.csv')

aggregate_funcs = {
    'Total Trade Investment': 'sum',
    'TOTAL O/I': 'sum',
    'TOTAL Retro': 'sum',
    'Cash Discount': 'sum',
}

# Strip thousands separators before converting the monetary columns to floats
trade[list(aggregate_funcs.keys())] = trade[list(aggregate_funcs.keys())].replace(',', '', regex=True).astype(float)

aggregated = trade.groupby(['SKU Descr.', 'Period', 'Year', 'Brand Descr.', 'SkuCode', 'Category'])
aggregated_trade = aggregated.agg(aggregate_funcs).reset_index()
aggregated_trade = aggregated_trade.drop_duplicates(subset=['SKU Descr.', 'Period', 'Year'])
aggregated_trade['Total Trade Investment'] = aggregated_trade['Total Trade Investment'].fillna(0)
aggregated_trade.to_csv('Cleaned Trade.csv', index=False)

# Read walk data
walk = pd.read_excel('GB_SwitchingMatrix_GFKMethod_Cereals_v4.xlsx', sheet_name='WalkAwayRate')
walk.rename(columns={'K': 'WalkAway'}, inplace=True)
walk['SKU'] = walk['SKU'].str.strip()

# Kellogg's products sold at Asda in the POS data
temp = pd.DataFrame(pos_data[(pos_data['geoCodeDesc'] == 'Asda')
                             & (pos_data['BrnDedPrivLab.full'] == 'KELLOGGS')]['prodDesc'].unique(),
                    columns=['prodDesc'])
walk = temp.merge(walk, how='outer', left_on='prodDesc', right_on='SKU')

# Keep matched rows, POS products without a walk-away rate, and non-Kellogg's SKUs
walk = pd.concat([walk[(walk['prodDesc'].notna()) & (walk['SKU'].notna())],
                  walk[walk['SKU'].isna()],
                  walk[walk['SKU'].str.contains("KELLOGG") == False]], ignore_index=True)
walk['SKU'] = walk['SKU'].fillna(walk['prodDesc'])
walk = walk.drop(columns='prodDesc')
walk['SKU1'] = walk['SKU'].str.replace("_", " ", regex=True)

# Cluster SKU descriptions on TF-IDF word features
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2))
X = vectorizer.fit_transform(walk['SKU1'])

# Optimal number of clusters found using the elbow method
kmeans = KMeans(n_clusters=45)
kmeans.fit(X)
walk['cluster'] = kmeans.predict(X)

# Impute missing walk-away rates with the mean rate of the SKU's text cluster
for i in walk.index.tolist():
    if np.isnan(walk.loc[i, 'WalkAway']):
        cluster_id = walk.loc[i, 'cluster']
        walk.loc[i, 'WalkAway'] = walk[walk['cluster'] == cluster_id]['WalkAway'].mean()

walk.to_csv('Cleaned Walk.csv', index=False)

# Read Switch Data
switch = pd.read_excel('GB_SwitchingMatrix_GFKMethod_Cereals_v4.xlsx', sheet_name='Swithing')

K_cols = [i for i in switch.columns if 'KELLOGG' in i] + ["Kellogg's Others"]
comp_cols = [i for i in switch.columns if 'KELLOGG' not in i and "Kellogg's" not in i][1:]

switch['KSwitchIndex'] = switch[K_cols].sum(axis=1)
switch['CompSwitchIndex'] = switch[comp_cols].sum(axis=1)
switch = switch[['SKU', 'KSwitchIndex', 'CompSwitchIndex']]

# Fill missing values from the "Kellogg's Others" row (row index 465 in the source sheet)
switch['KSwitchIndex'] = switch['KSwitchIndex'].fillna(switch[switch['SKU'] == "Kellogg's Others"]['KSwitchIndex'][465])
switch['CompSwitchIndex'] = switch['CompSwitchIndex'].fillna(switch[switch['SKU'] == "Kellogg's Others"]['CompSwitchIndex'][465])

# Normalise so the two indices sum to 1 for each SKU
temp = (switch['KSwitchIndex'] + switch['CompSwitchIndex'])
switch['KSwitchIndex'] = switch['KSwitchIndex'] / temp
switch['CompSwitchIndex'] = switch['CompSwitchIndex'] / temp

switch.to_csv('Cleaned Switch.csv', index=False)
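# Hedged sketch: the elbow search behind n_clusters=45 in the walk-data step is not shown in
# this script. One way it could be reproduced is to plot KMeans inertia against k for the
# TF-IDF matrix X built above; the k range and matplotlib usage here are illustrative, not the
# original analysis.
import matplotlib.pyplot as plt

inertias = []
k_values = list(range(5, 80, 5))
for k in k_values:
    km = KMeans(n_clusters=k, n_init=10, random_state=0)
    km.fit(X)
    inertias.append(km.inertia_)

plt.plot(k_values, inertias, marker='o')
plt.xlabel('Number of clusters k')
plt.ylabel('Inertia (within-cluster sum of squares)')
plt.title('Elbow plot for SKU description clusters')
plt.show()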