Data Preprocessing [12%]¶
Suggestion: Complete IU 3.6.4 before attempting this question.
- The datasets provided are in different file formats (xlsx and csv). Load the following files:
- "Aggregated Data"
- "category mapping"
- "price change"
- "stores mapping"
- "Transaction Data"
- Find a suitable way to bring the Aggregated Data, category mapping, price change and stores mapping data together into one master dataset and name the dataframe df. [3%]
- In the transaction data, remove the 0th index column (as this is unnecessary). [1%]
- Identify the % of missing values in each column in the transaction data. [1%]
- Remove the columns with more than 50% missing data. [1%]
- For the remaining missing values, identify and implement the best way to handle them. [1%]
- Check the datatypes of the transaction data and df to see if there are any issues with the datatype of the columns. Change to the correct datatype. [1%] [Hint: Check the dates]
- In the transaction data and df, generate month name column, day of week column and year column from the date column. [3%]
- Check if there are any duplicates in the transaction data and remove them. [1%]
Qn #1 & #2¶
The datasets provided are in different file formats (xlsx and csv). Load the following files:
- "Aggregated Data"
- "category mapping"
- "price change"
- "stores mapping"
- "Transaction Data"
Find a suitable way to bring the Aggregated Data, category mapping, price change and stores mapping data together into one master dataset and name the dataframe df. [3%]
'''
TODO:
1. Load the aggregated sales data from CSV
2. Load category mappings from Excel and merge on 'Item'
- Drop 'Main Item Names' after merge
3. Load price change data from Excel and merge on 'Item'
- Remove any duplicated columns if created
4. Load store address data and merge on 'Brand/Outlet'
- Drop 'Outlet/Listings' after merge
5. Drop any columns with the '_dup' suffix from all merges
6. Preview the final merged DataFrame with print(df.head())
'''
import pandas as pd
# Load the Aggregated Data
print('--- Loaded Aggregated Data ---')
df = pd.read_csv('Aggregated Data.csv')
# Load Category Mapping
print('--- Loaded Category Mapping ---')
category_mapping = pd.read_excel('category mapping.xlsx')
df = df.merge(
category_mapping,
left_on='Item',
right_on='Main Item Names',
how='left',
suffixes=('', '_dup')
)
df.drop(columns=['Main Item Names'], inplace=True)
# Load Price Change
price_change = pd.read_excel('price change.xlsx')
df = df.merge(
price_change,
on='Item',
how='left',
suffixes=('', '_dup')
)
# Drop exact duplicate column names, if any (the '', '_dup' suffixes above
# normally prevent clashes, so this is a belt-and-braces check)
df = df.loc[:, ~df.columns.duplicated()]
# Load Store Mapping
stores_mapping = pd.read_excel('stores mapping.xlsx')
df = df.merge(
stores_mapping,
left_on='Brand/Outlet',
right_on='Outlet/Listings',
how='left',
suffixes=('', '_dup')
)
df.drop(columns=['Outlet/Listings'], inplace=True)
# Final cleanup: drop all columns that end in "_dup"
df = df.loc[:, ~df.columns.str.endswith('_dup')]
# Sample output
print('--- Sample Output ---')
print(df.head())
--- Loaded Aggregated Data ---
--- Loaded Category Mapping ---
--- Sample Output ---
                       Brand/Outlet                              Item  \
0  XXX SG - Duo (Extended Delivery)  Banana Cake with Acai Superfruit
1  XXX SG - Duo (Extended Delivery)  Banana Cake with Acai Superfruit
2  XXX SG - Duo (Extended Delivery)  Banana Cake with Acai Superfruit
3  XXX SG - Duo (Extended Delivery)  Banana Cake with Acai Superfruit
4  XXX SG - Duo (Extended Delivery)  Banana Cake with Acai Superfruit

    Platform      Date  Quantity Category  Old Price  New Price  \
0  DELIVEROO  1-Jan-22         0      NaN        NaN        NaN
1  DELIVEROO  2-Jan-22         0      NaN        NaN        NaN
2  DELIVEROO  3-Jan-22         0      NaN        NaN        NaN
3  DELIVEROO  4-Jan-22         0      NaN        NaN        NaN
4  DELIVEROO  5-Jan-22         0      NaN        NaN        NaN

                                Address  Postal Code
0  7 Fraser Street,#01-58, DUO Galleria       189356
1  7 Fraser Street,#01-58, DUO Galleria       189356
2  7 Fraser Street,#01-58, DUO Galleria       189356
3  7 Fraser Street,#01-58, DUO Galleria       189356
4  7 Fraser Street,#01-58, DUO Galleria       189356
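One check worth keeping after a chain of left merges: a many-to-one join silently duplicates rows if a mapping file repeats a key. pandas' merge accepts a validate argument for exactly this; a minimal sketch against the frames above:
# Sketch: validate='m:1' raises MergeError if category_mapping repeats a
# 'Main Item Names' key (assumption: each item maps to one category row)
agg = pd.read_csv('Aggregated Data.csv')
checked = agg.merge(category_mapping, left_on='Item', right_on='Main Item Names',
                    how='left', validate='m:1')
assert len(checked) == len(agg)  # a left m:1 merge must not add rows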
Qn #3¶
In the transaction data, remove the 0th index column (as this is unnecessary). [1%]
"""
TODO:
1. Load the transaction CSV file into a DataFrame
2. Print the original DataFrame
3. Remove the first row (index 0) if it contains unnecessary placeholder/header data
4. Reset the index to maintain a clean sequence
5. Print the cleaned DataFrame
"""
import pandas as pd
# Load the transaction CSV file
df = pd.read_csv('Transaction Data.csv')
print('--- Original DataFrame ---')
print(df)
# Drop the first row (index 0)
df = df.drop(index=0).reset_index(drop=True)
print('--- DataFrame After Dropping First Row ---')
print(df.head())
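An equivalent one-step alternative is to absorb the exported index at load time; index_col is a standard read_csv parameter:
# Read the 0th column as the index, then discard it entirely
df = pd.read_csv('Transaction Data.csv', index_col=0).reset_index(drop=True)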
Qn #4¶
Identify the % of missing values in each column in the transaction data. [1%]
"""
TODO:
1. Load the transaction CSV file into a DataFrame
2. Calculate the percentage of missing (NaN) values for each column
3. Display the missing value percentages in descending order
"""
import pandas as pd
# Load the transaction CSV file
df = pd.read_csv('Transaction Data.csv')
# Calculate percentage of missing values per column
missing_percentages = df.isna().mean() * 100
# Display the percentages in descending order
print('--- Percentage of Missing Values by Column ---')
print(missing_percentages.sort_values(ascending=False))
Qn #5¶
Remove the columns with more than 50% missing data. [1%]
'''
TODO:
- Read the transaction csv file
- Find columns with more than 50% missing values
- Drop those columns
'''
import pandas as pd
data = pd.read_csv('Transaction Data.csv')
# Identify columns with more than 50% missing values BEFORE dropping
removed_cols = data.columns[data.isna().mean() > 0.5]
# Now drop those columns
data = data.loc[:, data.isna().mean() <= 0.5]
# Print the removed columns
print("Removed columns due to >50% missing data:", removed_cols.tolist())
Qn #6¶
For the remaining missing values, identify and implement the best way to handle them. [1%]
'''
TODO:
1. Load the transaction data from 'Transaction Data.csv'
2. Identify and drop columns with more than 50% missing values
- Print the names of these removed columns
3. For the remaining columns:
a. Fill missing values in numeric columns with 0
- Suitable for financial fields like discounts and fees
b. Fill missing values in object (string) columns with 'Unknown'
- Maintains usability for grouping or filtering
4. Print the number of remaining missing values per column (should be 0)
'''
import pandas as pd
# Load data
data = pd.read_csv('Transaction Data.csv')
# Step 1: Drop columns with >50% missing values
removed_cols = data.columns[data.isna().mean() > 0.5]
data = data.loc[:, data.isna().mean() <= 0.5]
print("Removed columns due to >50% missing data:", removed_cols.tolist())
# Step 2: Handle remaining missing values
# Fill numeric columns with 0 (assuming financial values like discounts or fees)
numeric_cols = data.select_dtypes(include='number').columns
data[numeric_cols] = data[numeric_cols].fillna(0)
# Fill object (string) columns with "Unknown" or a placeholder
object_cols = data.select_dtypes(include='object').columns
data[object_cols] = data[object_cols].fillna('Unknown')
# Optionally: confirm no more missing values
print("Remaining missing values per column:")
print(data.isna().sum())
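Zero-filling is defensible for fees and discounts, but it is not the only choice. If a missing value means "unrecorded" rather than "no charge", a median fill is more robust to outliers; an alternative sketch under that assumption:
# Alternative sketch: median imputation for numeric gaps (assumes missingness
# means the value went unrecorded rather than being truly zero)
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())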
Qn #7¶
Check the datatypes of the transaction data and df to see if there are any issues with the datatype of the columns. Change to the correct datatype. [1%]
[Hint: Check the dates]
'''
TODO:
1. Inspect the current data types of all columns in the transaction data
2. Identify any incorrect types (e.g. dates stored as strings)
3. Convert:
a. 'day_start' → datetime
b. 'created_datetime' → datetime
4. Confirm the changes by printing the updated data types
'''
import pandas as pd
# Load the dataset (if not already loaded)
df = pd.read_csv('Transaction Data.csv')
# Check original data types
print("Original Data Types:\n", df.dtypes)
# Convert date columns to datetime
df['day_start'] = pd.to_datetime(df['day_start'], errors='coerce')
df['created_datetime'] = pd.to_datetime(df['created_datetime'], errors='coerce')
# Check updated data types
print("\nUpdated Data Types:\n", df.dtypes)
Original Data Types:
Unnamed: 0                   int64
day_start                   object
created_datetime            object
brand_name                  object
outlet                      object
orderid                     object
order_delivery_platform     object
order_subtotal             float64
order_discount             float64
order_delivery_fee         float64
order_fulfillment_mode      object
item_id                     object
item_name                   object
item_price                 float64
item_quantity                int64
modifier_name               object
modifier_price             float64
modifier_quantity            int64
dtype: object

Updated Data Types:
Unnamed: 0                          int64
day_start                  datetime64[ns]
created_datetime           datetime64[ns]
brand_name                         object
outlet                             object
orderid                            object
order_delivery_platform            object
order_subtotal                    float64
order_discount                    float64
order_delivery_fee                float64
order_fulfillment_mode             object
item_id                            object
item_name                          object
item_price                        float64
item_quantity                       int64
modifier_name                      object
modifier_price                    float64
modifier_quantity                   int64
dtype: object
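The cell above only fixes the transaction data, but the question also covers df. A minimal sketch for the master dataframe, assuming it is still in scope under another name (master_df below is a placeholder, since this cell rebinds df) and that its Date column holds strings like '1-Jan-22', as shown in the Qn #1 & #2 preview:
# Sketch: convert the master dataframe's 'Date' column; 'master_df' is a
# placeholder name, and the format string matches values like '1-Jan-22'
master_df['Date'] = pd.to_datetime(master_df['Date'], format='%d-%b-%y', errors='coerce')
print(master_df['Date'].dtype)  # expected: datetime64[ns]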
Qn #8¶
In the transaction data and df, generate month name column, day of week column and year column from the date column. [3%]
'''
TODO:
1. Import the pandas library.
2. Create a sample DataFrame with a 'day_start' column containing datetime values.
3. Convert the 'day_start' column to datetime format (if not already).
4. Extract the month name from 'day_start' and store it in a new column 'month_name'.
5. Extract the day of the week from 'day_start' and store it in a new column 'day_of_week'.
6. Extract the year from 'day_start' and store it in a new column 'year'.
7. Display the first few rows of the DataFrame with the new time-based columns.
'''
import pandas as pd
# Sample DataFrame with a 'day_start' column
data = {
'day_start': pd.to_datetime([
'2024-01-15', '2024-02-20', '2024-03-25',
'2024-04-30', '2024-05-05'
])
}
df = pd.DataFrame(data)
# Create time-based features
df['month_name'] = df['day_start'].dt.month_name()
df['day_of_week'] = df['day_start'].dt.day_name()
df['year'] = df['day_start'].dt.year
# Preview the new columns
print(df[['day_start', 'month_name', 'day_of_week', 'year']].head())
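The sample frame above only demonstrates the API; the question asks for these columns on the transaction data and df themselves. A sketch under the assumption that both are in scope, with txn and master_df as placeholder names for the transaction data (datetime day_start) and the master dataframe (converted Date):
# Sketch: derive the three columns on both real frames ('txn' and 'master_df'
# are placeholder names, not variables defined earlier in this notebook)
for frame, date_col in [(txn, 'day_start'), (master_df, 'Date')]:
    frame['month_name'] = frame[date_col].dt.month_name()
    frame['day_of_week'] = frame[date_col].dt.day_name()
    frame['year'] = frame[date_col].dt.year
print(txn[['day_start', 'month_name', 'day_of_week', 'year']].head())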
| | day_start | month_name | day_of_week | year |
|---|---|---|---|---|
| 0 | 2022-03-01 | March | Tuesday | 2022.0 |
| 1 | 2022-03-01 | March | Tuesday | 2022.0 |
| 2 | 2022-03-01 | March | Tuesday | 2022.0 |
| 3 | 2022-03-01 | March | Tuesday | 2022.0 |
| 4 | 2022-03-01 | March | Tuesday | 2022.0 |
Qn #9¶
Check if there are any duplicates in the transaction data and remove them. [1%]
'''
TODO:
1. Import the pandas library.
2. Create a sample DataFrame with some duplicate rows.
3. Identify the number of duplicate rows using df.duplicated().
4. Print the count of duplicate rows.
5. If duplicates exist:
a. Remove them using df.drop_duplicates().
b. Print a message confirming removal.
6. Print the new shape of the DataFrame to confirm the duplicates are gone.
'''
import pandas as pd
# Sample data with intentional duplicate
data = {
'TransactionID': [1, 2, 3, 3],
'Item': ['Apple', 'Banana', 'Orange', 'Orange'],
'Amount': [1.0, 2.5, 3.0, 3.0]
}
# Create DataFrame
df = pd.DataFrame(data)
# Step 1: Check for duplicates
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")
# Step 2: Remove duplicates if any
if duplicate_count > 0:
df = df.drop_duplicates()
print("Duplicates removed.")
# Step 3: Confirm new shape
print(f"New dataset shape: {df.shape}")
Number of duplicate rows: 1
Duplicates removed.
New dataset shape: (3, 3)
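The toy example shows the mechanics; applied to the real transaction data the same pattern is a few lines. A sketch, assuming the file from the earlier steps:
# Sketch: de-duplicate the actual transaction data
txn = pd.read_csv('Transaction Data.csv')
dupes = txn.duplicated().sum()
print(f"Duplicate rows in transaction data: {dupes}")
txn = txn.drop_duplicates().reset_index(drop=True)
print(f"Shape after de-duplication: {txn.shape}")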
Exploratory Data Analysis [25%]¶
Suggestion: Complete IU 3.6.7 before attempting this question.
- In df dataframe, identify the top platform having maximum sales (quantity wise) for each
month. Create a suitable visualization to display the same. [5%]
- In df dataframe, what are the top 5 stores having maximum sales (quantity wise) for each
month. Create a suitable visualization to show the monthly quantity sold trend for these top 5 stores. [5%]
- Use a suitable method to update NaN values in Category to "Uncategorized" under df, then create a suitable visualization to display the percentage of sales (based on Quantity) across all categories. [5%]
- Did weekdays perform better than weekends in terms of average daily quantity sold across all platforms? Create a suitable visualization to show the daily trend. [7%]
- Provide appropriate title and axis labels for all charts created. [3%]
Qn #1¶
In df dataframe, identify the top platform having maximum sales (quantity wise) for each month. Create a suitable visualization to display the same. [5%]
'''
# TODO:
1. Load the transaction dataset and parse the datetime column.
2. Extract the month from the datetime for grouping.
3. Group data by month and delivery platform, summing up the item quantities.
4. Identify the platform with the highest quantity sold for each month.
5. Visualize the result using a bar chart, annotating each bar with the top platform name.
'''
import pandas as pd
import matplotlib.pyplot as plt
# Load and parse the transaction data
df = pd.read_csv('Transaction Data.csv')
df['created_datetime'] = pd.to_datetime(df['created_datetime'], errors='coerce')
# Create a new column for 'month'
df['month'] = df['created_datetime'].dt.to_period('M')
# Group by month and platform, summing quantities
monthly_platform_sales = (
df.groupby(['month', 'order_delivery_platform'])['item_quantity']
.sum()
.reset_index()
)
# Identify the top platform per month
top_platforms = monthly_platform_sales.loc[
monthly_platform_sales.groupby('month')['item_quantity'].idxmax()
]
# Plot the top platforms
plt.figure(figsize=(12, 6))
plt.bar(top_platforms['month'].astype(str), top_platforms['item_quantity'], color='skyblue')
plt.xlabel('Month')
plt.ylabel('Total Quantity Sold')
plt.title('Top Platform by Quantity Sold for Each Month')
plt.xticks(rotation=45)
# Annotate with platform names
for i, row in top_platforms.iterrows():
plt.text(row['month'].strftime('%Y-%m'), row['item_quantity'] + 1, row['order_delivery_platform'],
ha='center', va='bottom', fontsize=8, rotation=90)
plt.tight_layout()
plt.show()
Qn #2¶
In df dataframe, what are the top 5 stores having maximum sales (quantity wise) for each month. Create a suitable visualization to show the monthly quantity sold trend for these top 5 stores. [5%]
'''
TODO:
1. Extract month from the created_datetime column.
2. Group data by month and outlet, summing the item_quantity.
3. Identify the top 5 outlets by overall quantity sold.
4. Filter to keep only those top 5 stores.
5. Pivot the data to reshape for line plotting.
6. Plot monthly trends for the top 5 outlets.
'''
import pandas as pd
import matplotlib.pyplot as plt
# Ensure datetime and month columns exist
df['created_datetime'] = pd.to_datetime(df['created_datetime'], errors='coerce')
df['month'] = df['created_datetime'].dt.to_period('M')
# Group by month and outlet
monthly_store_sales = (
df.groupby(['month', 'outlet'])['item_quantity']
.sum()
.reset_index()
)
# Identify top 5 outlets by total quantity
top_5_outlets = (
monthly_store_sales.groupby('outlet')['item_quantity']
.sum()
.nlargest(5)
.index
)
# Filter data to only top 5 outlets
top_5_data = monthly_store_sales[monthly_store_sales['outlet'].isin(top_5_outlets)]
# Pivot for plotting
plot_data = top_5_data.pivot(index='month', columns='outlet', values='item_quantity').fillna(0)
# Plot the trends
plot_data.plot(kind='line', marker='o', figsize=(12, 6))
plt.title('Monthly Quantity Sold Trend for Top 5 Stores')
plt.xlabel('Month')
plt.ylabel('Quantity Sold')
plt.grid(True)
plt.xticks(rotation=45)
plt.legend(title='Outlet', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
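The cell ranks outlets by overall quantity and then traces their monthly trend. If the marker instead intends the top 5 within each month, a hedged variant on the same grouped frame:
# Variant sketch: top 5 outlets per month rather than the overall top 5
top5_per_month = (
    monthly_store_sales.sort_values('item_quantity', ascending=False)
    .groupby('month')
    .head(5)
)
print(top5_per_month.head(10))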
Qn #3¶
Use a suitable method to update NaN values in Category to "Uncategorized" under df, then create a suitable visualization to display the percentage of sales (based on Quantity) across all categories. [5%]
'''
TODO:
1. Load the category mapping file.
2. Merge it with the main transaction DataFrame using item_name as the key.
3. Replace NaN values in the 'Category' column with "Uncategorized".
4. Group by Category and sum the item_quantity.
5. Calculate percentage share of each category.
6. Plot the category distribution using a pie chart.
'''
import pandas as pd
import matplotlib.pyplot as plt
# Load the category mapping file
category_mapping = pd.read_excel("category mapping.xlsx") # adjust path if needed
# Merge with transaction data
df = df.merge(
category_mapping,
how='left',
left_on='item_name',
right_on='Main Item Names'
)
# Fill missing categories
df['Category'] = df['Category'].fillna('Uncategorized')
# Group by category and sum quantities
category_sales = (
df.groupby('Category')['item_quantity']
.sum()
.reset_index()
)
# Calculate percentages
total_quantity = category_sales['item_quantity'].sum()
category_sales['percentage'] = (category_sales['item_quantity'] / total_quantity) * 100
# Plot as pie chart
plt.figure(figsize=(10, 8))
plt.pie(
category_sales['percentage'],
labels=category_sales['Category'],
autopct='%1.1f%%',
startangle=140
)
plt.title('Sales Distribution by Category (Based on Quantity)')
plt.axis('equal')
plt.tight_layout()
plt.show()
Qn #4¶
Did weekdays perform better than weekends in terms of average daily quantity sold across all platforms? Create a suitable visualization to show the daily trend. [7%]
'''
TODO:
1. Extract the date and day of the week from the created_datetime column.
2. Group by date and platform to get total daily quantity sold.
3. Classify each day as 'Weekday' or 'Weekend' based on the day of the week.
4. Compute the average daily quantity sold per platform for weekdays and weekends.
5. Create a grouped bar chart to visualize platform performance on weekdays vs weekends.
'''
import pandas as pd
import matplotlib.pyplot as plt
# Extract date and day of week
df['date'] = df['created_datetime'].dt.date
df['day_of_week'] = df['created_datetime'].dt.day_name()
# Daily total quantity sold per platform
daily_sales = (
df.groupby(['date', 'order_delivery_platform'])['item_quantity']
.sum()
.reset_index()
)
# Tag each date as 'Weekday' or 'Weekend'
daily_sales['day_of_week'] = pd.to_datetime(daily_sales['date']).dt.dayofweek
daily_sales['day_type'] = daily_sales['day_of_week'].apply(lambda x: 'Weekend' if x >= 5 else 'Weekday')
# Compute average daily quantity sold
avg_sales = (
daily_sales.groupby(['order_delivery_platform', 'day_type'])['item_quantity']
.mean()
.reset_index()
)
# Pivot for plotting
pivot_avg = avg_sales.pivot(index='order_delivery_platform', columns='day_type', values='item_quantity')
# Plot the grouped bar chart
pivot_avg.plot(kind='bar', figsize=(10, 6))
plt.title('Average Daily Quantity Sold: Weekday vs Weekend (by Platform)')
plt.xlabel('Platform')
plt.ylabel('Average Quantity Sold per Day')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.tight_layout()
plt.show()
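The grouped bars answer the weekday-vs-weekend comparison, but the question also asks to show the daily trend itself. A complementary sketch reusing daily_sales from the cell above:
# Sketch: total quantity sold per day across all platforms, as a time series
daily_totals = daily_sales.groupby('date')['item_quantity'].sum()
plt.figure(figsize=(12, 4))
plt.plot(pd.to_datetime(daily_totals.index), daily_totals.values, linewidth=1)
plt.title('Daily Total Quantity Sold (All Platforms)')
plt.xlabel('Date')
plt.ylabel('Quantity Sold')
plt.tight_layout()
plt.show()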
Qn #5¶
Provide appropriate title and axis labels for all charts created. [3%]
'''
TODO:
1. Top platform by quantity sold per month – bar chart
2. Top 5 outlet monthly trends – line chart
3. Sales distribution by category – pie chart
4. Weekday vs Weekend average quantity per platform – grouped bar chart
'''
import matplotlib.pyplot as plt
# --- Chart 1: Top Platform by Quantity Sold per Month ---
plt.figure(figsize=(12, 6))
plt.bar(top_platforms["month"].astype(str), top_platforms["item_quantity"], color="skyblue")
plt.xlabel("Month")
plt.ylabel("Total Quantity Sold")
plt.title("Top Platform by Quantity Sold for Each Month")
plt.xticks(rotation=45)
for _, row in top_platforms.iterrows():
    plt.text(str(row['month']), row['item_quantity'] + 1, row['order_delivery_platform'],
             ha='center', va='bottom', fontsize=8, rotation=90)
plt.tight_layout()
plt.show()
# --- Chart 2: Top 5 Outlets Monthly Trend ---
plot_data.plot(kind='line', marker='o', figsize=(12, 6))
plt.title('Monthly Quantity Sold Trend for Top 5 Outlets')
plt.xlabel('Month')
plt.ylabel('Total Quantity Sold')
plt.grid(True)
plt.xticks(rotation=45)
plt.legend(title='Outlet', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
# --- Chart 3: Category Distribution Pie Chart ---
plt.figure(figsize=(10, 8))
plt.pie(
category_sales['percentage'],
labels=category_sales['Category'],
autopct='%1.1f%%',
startangle=140
)
plt.title("Sales Distribution by Category (Based on Quantity)")
plt.axis('equal')
plt.tight_layout()
plt.show()
# --- Chart 4: Weekday vs Weekend Comparison by Platform ---
pivot_avg.plot(kind='bar', figsize=(10, 6))
plt.title("Average Daily Quantity Sold: Weekday vs Weekend (by Platform)")
plt.xlabel("Delivery Platform")
plt.ylabel("Average Quantity Sold Per Day")
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.tight_layout()
plt.show()
Data Aggregation / Transformation [13%]¶
Suggestion: Complete IU 3.6.8 before attempting this question.
- Modify the transaction data to create two dataframes by selecting orderid, item_name, item_qty, item_price and orderid, modifier_name and modifier_qty, modifier_price. [1%]
- Append the above two dataframes to create a new dataframe, call it df_temp. [1%]
- Remove the rows with item_price of 0 from df_temp. [1%]
- Find a way to prepare the data* in the format given in the screenshot below: [10%]
- The index is the order id.
- The columns are individual products in the dataset.
- Encode them such that if a product is purchased in a transaction, the value is 1 else the value is 0.
* This kind of encoding is helpful in performing market basket analysis.
Qn #1¶
Modify the transaction data to create two dataframes by selecting orderid, item_name, item_qty, item_price and orderid, modifier_name and modifier_qty, modifier_price. [1%]
'''
TODO:
1. Load the transaction data
2. Create two dataframes:
- df_items with columns: orderid, item_name, item_qty, item_price
- df_modifiers with columns: orderid, modifier_name, modifier_qty, modifier_price
'''
import pandas as pd
# Load the CSV
transaction_data = pd.read_csv("Transaction Data.csv")
# Create the first dataframe for item details
df_items = transaction_data[['orderid', 'item_name', 'item_quantity', 'item_price']].copy()
# Create the second dataframe for modifier details
df_modifiers = transaction_data[['orderid', 'modifier_name', 'modifier_quantity', 'modifier_price']].copy()
# Display the first few rows of each dataframe
print("Items DataFrame:")
print(df_items.head())
print("\nModifiers DataFrame:")
print(df_modifiers.head())
Items DataFrame:
  orderid item_name  item_quantity  item_price
0  GF-696    Kokoro              1        15.9
1    5116     Salad              1        11.9
2    5116     Salad              1        11.9
3    5116     Salad              1        11.9
4    5116     Salad              1        11.9

Modifiers DataFrame:
  orderid         modifier_name  modifier_quantity  modifier_price
0  GF-696     Warm Protein Bowl                  1             0.0
1    5116          Black Olives                  1             0.0
2    5116          French Beans                  1             0.0
3    5116  Warm Baked Mushrooms                  1             2.9
4    5116          Baby Spinach                  1             3.0
Qn #2¶
Append the above two dataframes to create a new dataframe, call it df_temp. [1%]
'''
TODO:
1. Standardize column names between df_items and df_modifiers for appending.
2. Add a new column to distinguish item vs modifier rows.
3. Append df_items and df_modifiers to form df_temp.
'''
# Step 1: Rename columns to match for both DataFrames (the source columns are
# item_quantity/modifier_quantity, as created in Qn #1)
df_items_renamed = df_items.rename(columns={
    'item_name': 'name',
    'item_quantity': 'quantity',
    'item_price': 'price'
})
df_items_renamed['type'] = 'item'
df_modifiers_renamed = df_modifiers.rename(columns={
    'modifier_name': 'name',
    'modifier_quantity': 'quantity',
    'modifier_price': 'price'
})
df_modifiers_renamed['type'] = 'modifier'
# Step 2: Combine the DataFrames
df_temp = pd.concat([df_items_renamed, df_modifiers_renamed], ignore_index=True)
# Display first few rows
print("--- Combined df_temp ---")
display(df_temp.head())
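A quick invariant check on the append is cheap and catches silent misalignment:
# Sanity check: concat must preserve every row from both inputs
assert len(df_temp) == len(df_items) + len(df_modifiers)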
Qn #3¶
Remove the rows with item_price of 0 from df_temp. [1%]
'''
TODO:
- Starting from df_temp (built in Qn #2), remove every row whose price is 0
- Report how many rows were removed and preview the result
'''
# After the rename in Qn #2, the original item_price/modifier_price values
# live in the unified 'price' column, so a single filter covers both.
rows_before = len(df_temp)
df_temp = df_temp[df_temp['price'] != 0].reset_index(drop=True)
print(f"Removed {rows_before - len(df_temp)} zero-price rows; {len(df_temp)} remain")
print(df_temp.head())
Qn #4¶
Find a way to prepare the data* in the format given in the screenshot below: [10%]
- The index is the order id.
- The columns are individual products in the dataset.
- Encode them such that if a product is purchased in a transaction, the value is 1 else the value is 0.
* This kind of encoding is helpful in performing market basket analysis.
'''
TODO:
1. Use df_temp to prepare data for one-hot encoding.
2. Set orderid as index and product names as columns.
3. Mark with 1 if product was purchased in that order, else 0.
'''
# Step 1: Ensure 'orderid' and 'name' columns exist
df_encoded = df_temp[['orderid', 'name']].copy()
# Step 2: Add a dummy column to indicate purchase
df_encoded['purchased'] = 1
# Step 3: Pivot table to create the desired binary matrix
pivot_df = df_encoded.pivot_table(
index='orderid',
columns='name',
values='purchased',
aggfunc='max', # In case duplicates exist
fill_value=0
)
# Step 4: Display result
print("--- Encoded Transaction Matrix ---")
display(pivot_df.head())
--- Encoded Transaction Matrix ---
| orderid | 1. Open your barePack app + select this restaurant in the list + enter container quantity to trigger OTP. | 2. Paste the OTP into Deliveroo 'Order Notes' (at check-out). | 3. The restaurant will verify your order via the OTP + pack in barePack reusable container(s). | Acai Glory | Acai Superfruit Banana Cake | Alkaline Water | Almond | Almond Butter | Apple | Arabian Spiced Vinaigrette | ... | Wrap | Wrap Meal | Yellow Raisin | Yogurt with Granola | Yogurt with Granola (Almond) | Yogurt with Granola (Blueberry) | Yogurt with Granola (Greek) | Yogurt with Granola (Raspberry) | barePack FlexBox [barePack member] | x2 Super Protein Patty (Vegan) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 10 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 100 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1000 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |

5 rows × 217 columns
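As the footnote says, this encoding feeds market basket analysis directly. A follow-on sketch, assuming the optional mlxtend package is installed (it is not part of the assignment's stated toolset):
# Sketch: mine frequent itemsets from the 0/1 matrix with mlxtend's apriori
from mlxtend.frequent_patterns import apriori
frequent_itemsets = apriori(pivot_df.astype(bool), min_support=0.01, use_colnames=True)
print(frequent_itemsets.sort_values('support', ascending=False).head())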