Analyzing my Amazon Purchase History

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn')
from datetime import datetime
%matplotlib notebook

Read in raw data, break out day/month/year, fix currency strings

  • Note use of .strftime("%B") to get month names
  • .apply() with a lambda that returns a pd.Series to generate multiple new columns at once
  • fix 'Item Total' column to make dollar amounts floats
In [5]:
df = pd.read_csv('data/amazon-redacted.csv')
df[['year','month','month_str','day']] = df.apply(lambda row: pd.Series([
    datetime.strptime(row['Order Date'], '%m/%d/%y').year,
    datetime.strptime(row['Order Date'], '%m/%d/%y').month,
    datetime.strptime(row['Order Date'], '%m/%d/%y').strftime('%B'),
    datetime.strptime(row['Order Date'], '%m/%d/%y').day
]),
                                     axis=1)
df['Item Total'] = \
    df['Item Total'].replace('[\$,)]','', regex=True).astype(float)
df.head()
Out[5]:
Order Date Category Website Condition List Price Per Unit Purchase Price Per Unit Quantity Item Subtotal Item Subtotal Tax Item Total Tax Exemption Applied Tax Exemption Type Exemption Opt-Out Currency Group Name year month month_str day
0 1/5/19 Health and Beauty Amazon.com new $10.25 $9.70 1 $9.70 $0.58 10.28 NaN NaN NaN USD NaN 2019 1 January 5
1 1/5/19 NaN Amazon.com new $11.99 $11.99 1 $11.99 $0.00 11.99 NaN NaN NaN USD NaN 2019 1 January 5
2 1/5/19 Misc. Amazon.com new $0.00 $6.00 1 $6.00 $0.00 6.00 NaN NaN NaN USD NaN 2019 1 January 5
3 1/5/19 Misc. Amazon.com new $0.00 $14.52 1 $14.52 $0.00 14.52 NaN NaN NaN USD NaN 2019 1 January 5
4 1/5/19 Tools & Home Improvement Amazon.com new $0.00 $43.99 1 $43.99 $0.00 43.99 NaN NaN NaN USD NaN 2019 1 January 5

Define a set of useful column names in a good order

In [6]:
useful=['year', 'month_str', 'day', 'Category', 'Item Total']
df[useful].head()
Out[6]:
year month_str day Category Item Total
0 2019 January 5 Health and Beauty 10.28
1 2019 January 5 NaN 11.99
2 2019 January 5 Misc. 6.00
3 2019 January 5 Misc. 14.52
4 2019 January 5 Tools & Home Improvement 43.99

At this point I can plot my monthly purchase total

In [7]:
datetime.strptime('1',"%m").strftime('%B')
Out[7]:
'January'
In [8]:
month_totals = df.groupby('month').sum().reset_index()

fig, ax = plt.subplots(figsize=(8,6), tight_layout=True)
sns.barplot(x='month',
            y='Item Total',
            facecolor='xkcd:cadet blue',
            data=month_totals,
            ax=ax)
plt.ylabel('Total Amazon Orders ($)')
ax.set_xticklabels(month_totals['month'].apply(lambda m: datetime.strptime(str(m),"%m").strftime('%B')));

What if this is skewed by large purchases?

Use .groupby() and .idxmax() to find most expensive item purchased each month

In [9]:
df.loc[df.groupby('month').idxmax()['Item Total'], useful]
Out[9]:
year month_str day Category Item Total
4 2019 January 5 Tools & Home Improvement 43.99
35 2019 February 3 Health and Beauty 29.99
60 2019 March 18 Baby Product 105.95
83 2019 April 27 Tools & Home Improvement 60.60
100 2019 May 27 Kitchen 41.99
108 2019 June 2 Paperback 100.28
178 2019 July 31 Baby Product 211.99
186 2019 August 3 Electronics 185.49
228 2019 September 6 Baby Product 205.64
In [10]:
df_low = df[df['Item Total']<100]
adjusted_month_totals = df_low.groupby('month').sum().reset_index()

fig, ax = plt.subplots(figsize=(8,6), tight_layout=True)
sns.barplot(x='month',
            y='Item Total',
            facecolor='xkcd:cadet blue',
            data=adjusted_month_totals,
            ax=ax)
plt.ylabel('Total Amazon Orders ($)')
ax.set_xticklabels(month_totals['month'].apply(lambda m: datetime.strptime(str(m),"%m").strftime('%B')));

That didn't really change the picture

Just for kicks let's look at the order histogram for each month

In [11]:
fig, axes = plt.subplots(int(np.ceil(df['month'].nunique()/2)),
                         2,
                         figsize=(10,15),
                         tight_layout=True)
for month,month_str,ax in zip(df['month'].unique(), df['month_str'].unique(), axes.ravel()):
    sns.distplot(df.query('month==@month')['Item Total'],
                 bins=20,
                 ax=ax,
                 kde=False,
                 rug=True)
    plt.sca(ax)
    plt.xlabel('Item Price ($)')
    plt.ylabel('Number of Items')
    plt.title(month_str)