Women Shoe Prices
An exploretory data analysis on women shoe price
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
WomenShoe = pd.read_csv('Women_Shoe_Price.csv', parse_dates =
['dateAdded', 'dateUpdated', 'prices.dateAdded', 'prices.dateSeen'] )
WomenShoe.shape
miss_val = (WomenShoe.isnull().sum()/len(WomenShoe)*100).sort_values(ascending=False)
miss_val[miss_val>0]
(miss_val[miss_val==0].index)
WomenShoe = WomenShoe.drop(columns=
['id', 'asins', 'count', 'descriptions',
'dimension','ean', 'flavors', 'isbn','manufacturer',
'manufacturerNumber', 'merchants', 'keys', 'imageURLs',
'prices.availability', 'prices.color', 'prices.condition',
'prices.count', 'prices.flavor', 'prices.merchant', 'prices.offer',
'prices.returnPolicy', 'prices.shipping', 'prices.size', 'prices.source',
'prices.warranty', 'quantities', 'reviews', 'skus', 'upc', 'websiteIDs',
'weight', 'prices.sourceURLs','prices.dateAdded', 'prices.dateSeen',
'dateUpdated', 'sizes', 'sourceURLs'] )
#Check the new dimension of the dataset
WomenShoe.shape
WomenShoe = WomenShoe.rename(columns={"prices.amountMin": "MinPrice",
"prices.amountMax": "MaxPrice", "prices.currency": "currency"})
WomenShoe.head(8)
Correct Data Types
Actually, the data types for dateAdded, MinPrice, MaxPrice, and prices.isSale are all object when I first process this dataset. I tried to convert MinPrice and MaxPrice to float, but it didn't work. I looked at the data in Excel, I found there are so many rows with data that are overlapped with other columns, so I had to go over those 19,034 rows to find and correct the data manually.
WomenShoe.info()
WomenShoe['average_price'] = (WomenShoe['MinPrice'] + WomenShoe['MaxPrice'])/2
#drop MinPrice and MaxPrice columns
WomenShoe.drop(columns=['MinPrice', 'MaxPrice'], axis=1, inplace=True)
WomenShoe.head() #9 columns left
WomenShoe.currency.value_counts()
def curr_change(x):
if x.currency == 'AUD':
x.average_price = x.average_price * 0.73
if x.currency == 'CAD':
x.average_price = x.average_price * 0.76
if x.currency == 'EUR':
x.average_price = x.average_price * 1.19
if x.currency == 'GBP':
x.average_price = x.average_price * 1.33
return x
WomenShoe[WomenShoe['currency'] != "USD"].head()
WomenShoe = WomenShoe.apply(curr_change, axis=1)
currency_change = WomenShoe[WomenShoe['currency'] != "USD"]
currency_change.head()
WomenShoe.brand.value_counts()
brand_map = {'a2 by aerosoles':'aerosoles', "what's what by aerosoles":'aerosoles',
'adidas outdoor':'adidas',
'adriana new york':'adriana',
'alexander mcqueen by puma':'puma', 'alexander mcqueen':'puma',
'alpine':'alpine swiss',
'anne klein ak':'anne klein', 'anne klein sport':'anne klein',
'annie shoes':'annie',
'athena':'athena alexander',
'babe.':'babe',
'baretraps':'bare traps',
'bcbg max azria':'bcbg', 'bcbg paris':'bcbg', 'bcbgeneration':'bcbg',
'beacon shoes':'beacon',
'bebe sport':'bebe', 'bebe stu':'bebe',
'belle by sigerson morrison':'belle', 'belle sigerson morrison':'belle',
'bernie mev':'bernie', 'bernie mev.':'bernie',
'bettye by bettye muller':'bettye muller', 'bettye by bettye muller ':'bettye muller',
"breckelle's":'breckelles',
'callaway footwear':'callaway',
'calvin klein ck':'calvin klein', 'calvin klein jeans':'calvin klein',
'carlos by carlos santana':'carlos santana',
'charles by charles david':'charels david',
'see by chloe':'chloe',
'clarks artisan':'clarks', 'clarks artisan collection':'clarks', 'clarks collection':'clarks',
'cobb hill by new balance':'new balance', 'cobb hill':'new balance',
'maria sharapova by cole haan': 'cole hann',
"corky's":'corkys', "corky's footwear":'corkys', "corkys footwear, inc.":'corkys',
'dearforms':'dearfoams', 'df by dearfoams':'dearfoams',
'derek lam 10 crosby':'derek lam',
'diba.true':'dibatrue',
'dolce&gabbana':'dolce and gabbana', 'dolce & gabbana':'dolce and gabbana', 'dolce by mojo moxy':'dolce and gabbana',
'dolce vita':'dolce and gabbana', 'dv8 by dolce vita':'dolce and gabbana', 'dv by dolce vita':'dolce and gabbana',
"dr. scholl's":'dr scholls',
"dr. martens air wair":'drmartens',
'drew shoe':'drew',
'easy spirit e360':'easy spirit', 'easy spirit.':'easy spirit',
'ellie shoes':'ellie',
'emu australia':'emu',
'fergie footwear':'fergie',
'forever collectible':'forever', 'forever link':'forever', 'fourever funky':'forever',
'sarto by franco sarto':'franco sarto',
'ferriniusa':'ferrini',
'fitflop':'fit flop',
'funtasma by pleaser':'funtasma',
'g by guess':'guess',
'gc shoes':'gc',
'genuine grip footwear':'genuine grip',
"hogan by tod's":'hogan',
'soft style by hush puppies':'hush puppies',
'ilse jacobsen hornbaek':'ilse jacobson',
'isaacmizrahi':'isaac mizrahi',
'italian shoe makers':'italian comfort',
'j.renee':'j. renee',
'jbu by jambu':'jambu',
'josefseibel':'josef siebel',
'justin blair':'justin', 'justin boots':'justin', 'justin gypsy':'justin',
'kate spade new york':'kate spade',
'kenneth cole reaction':'kenneth cole', 'kenneth cole ny':'kenneth cole', 'kenneth cole new york':'kenneth cole',
'unlisted kenneth cole':'kenneth cole',
'lamo sheepskin inc':'lamo',
'lifestride':'lifes tride',
'luoluo':'luo luo',
'marc fisher ltd':'marc fisher',
'mia heritage':'mia',
'micahel kors':'michael kors', 'michael michael kors':'michael kors',
'mobils by mephisto':'mephisto',
'top moda':'moda', 'moda essentials':'moda', 'everybody by bz moda':'moda',
'muk luks a la mode':'muk luks',
'munro american':'munro',
'naot footwear':'naot',
'new@titude':'new attitude', 'new@ttitude':'new attitude',
'nina originals':'nina',
'nine west vintage america collection':'nine west',
'nufoot���':'nufoot',
'pleaser shoes':'pleaser', 'pleaser usa, inc.':'pleaser', 'pleaserusa':'pleaser',
'rachel':'rachel roy', 'rachel rachel roy ':'rachel roy',
'lauren by ralph lauren':'ralph lauren', 'lauren ralph lauren':'ralph lauren', 'lauren lorraine':'ralph lauren',
'polo ralph lauren':'ralph lauren','ralph lauren denim supply':'ralph lauren',
'rieker-antistress':'rieker',
'rocket dog brands llc':'rocket dog',
'sanita clogs':'sanita',
'ferragamo':'salvatore ferragamo',
'skechers usa':'skechers',
'sperry top sider':'sperry', 'sperry top-sider':'sperry',
"l'artiste by spring step":'spring step', "flexus by spring step":'spring step', "patrizia by spring step ":'spring step',
"patrizia pepe":'spring step', "patrizia":'spring step',
'steven steve madden':'steve madden',
'style & co.':'style and co',
'timberland earthkeepers':'timberland', 'timberland pro':'timberland',
'toms shoes':'toms',
'tony lama boot co.':'tony lama',
'totes isotoner':'totes',
'trotter':'trotters',
'ugg australia':'ugg',
'famous name brand':'unbranded', 'generic':'unbranded', 'generic surplus':'unbranded', 'non-branded':'unbranded',
'not applicable':'unbranded', 'not rated':'unbranded', 'lucky brand':'unbranded', 'lucky brand':'unbranded',
'very fine dance shoes':'unbranded',
'valentino noir':'valentino',
'victoria k.':'victoria',
'vince camuto':'vince',
'vionic by orthaheel':'vionic', 'vionic with orthaheel technology':'vionic',
'elites by walking cradles':'walking cradles', 'elites':'walking cradles','mark lemp by walking cradles':'walking cradles',
'rose petals by walking cradles':'walking cradles', 'the walking cradle company':'walking cradles'
}
Reference this note book https://www.kaggle.com/ashishg21/data-cleaning-and-some-analysis-shoe-prices.
WomenShoe['brand'] = WomenShoe['brand'].str.lower()
# replace the brand names with the dictionary
WomenShoe['brand_clean'] = WomenShoe['brand'].replace(brand_map)
WomenShoe.brand_clean.value_counts()
WomenShoe.colors.value_counts()
color_rank = WomenShoe.colors.value_counts().head(10)
# append 10 colors with the highest count in to a list
rows = []
for i in range(10):
rows.append([color_rank.index[i], color_rank[i]])
# convert the list to a pandas dataframe
Col_Rank = pd.DataFrame(rows, columns=["color", "count"])
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize=(12, 5))
Col_Rank = Col_Rank.sort_values('count')
ax.barh(Col_Rank['color'], Col_Rank['count'], color=
['pink', 'green', 'silver', 'red', 'gray', 'white', 'blue', 'beige', 'brown', 'black'])
ax.set_ylabel("Color")
ax.set_xlabel("Count")
ax.set_title("Top 10 Color Count")
plt.show()
Black = WomenShoe[WomenShoe['colors'] == "Black"]
Brown = WomenShoe[WomenShoe['colors'] == "Brown"]
Beige = WomenShoe[WomenShoe['colors'] == "Beige"]
Blue = WomenShoe[WomenShoe['colors'] == "Blue"]
White = WomenShoe[WomenShoe['colors'] == "White"]
Gray = WomenShoe[WomenShoe['colors'] == "Gray"]
# Make a copy of dataframes for each color
Black1 = WomenShoe[WomenShoe['colors'] == "Black"]
Brown1 = WomenShoe[WomenShoe['colors'] == "Brown"]
Beige1 = WomenShoe[WomenShoe['colors'] == "Beige"]
Blue1 = WomenShoe[WomenShoe['colors'] == "Blue"]
White1 = WomenShoe[WomenShoe['colors'] == "White"]
Gray1 = WomenShoe[WomenShoe['colors'] == "Gray"]
def outlier(x):
y = (x['average_price'].quantile(0.75) - x.average_price.quantile(0.25)) * 1.5 + x.average_price.quantile(0.75)
return y
print(outlier(Black))
print(outlier(Brown))
print(outlier(Beige))
print(outlier(Blue))
print(outlier(White))
print(outlier(Gray))
Black.drop(Black[Black['average_price'] > 167].index, inplace = True)
#Remove the outliers for Brown shoes
Brown.drop(Brown[Brown['average_price'] > 205].index, inplace = True)
#Remove the outliers for Beige shoes
Beige.drop(Beige[Beige['average_price'] > 154].index, inplace = True)
#Remove the outliers for Blue shoes
Blue.drop(Blue[Blue['average_price'] > 121].index, inplace = True)
#Remove the outliers for White shoes
White.drop(White[White['average_price'] > 145].index, inplace = True)
#Remove the outliers for Beige shoes
Gray.drop(Gray[Gray['average_price'] > 191].index, inplace = True)
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize=(15, 7))
ax.boxplot([Black["average_price"], Brown["average_price"], Beige["average_price"],
Blue["average_price"], White["average_price"], Gray["average_price"]])
ax.set_xticklabels(["Black", "Brown", "Beige", "Blue", "White", "Gray"])
ax.set_ylabel("Price")
ax.set_xlabel("Color")
ax.set_title('Box Plot for the Top 6 Colors')
plt.show()
plt.style.use('ggplot')
fig, ax = plt.subplots(3, 2, figsize=(15, 8))
ax[0, 0].hist(Black1["average_price"], bins=100, range=(0,500), color = 'black')
ax[0,0].set_title('Black Shoes')
ax[0,0].set_ylabel('Frequency')
ax[0, 1].hist(Brown1["average_price"], bins=100, range=(0,500), color = 'Brown')
ax[0,1].set_title('Brown Shoes')
ax[1, 0].hist(Beige1["average_price"], bins=100, range=(0,500), color = 'Beige')
ax[1,0].set_title('Beige Shoes')
ax[1,0].set_ylabel('Frequency')
ax[1, 1].hist(Blue1["average_price"], bins=100, range=(0,500), color = 'Blue')
ax[1,1].set_title('Blue Shoes')
ax[2, 0].hist(White1["average_price"], bins=100, range=(0,500), color = 'white')
ax[2,0].set_title('White Shoes')
ax[2,0].set_xlabel('Price')
ax[2,0].set_ylabel('Frequency')
ax[2, 1].hist(Gray1["average_price"], bins=100, range=(0,500), color = 'gray')
ax[2,1].set_title('Gray Shoes')
ax[2,1].set_xlabel('Price')
plt.tight_layout()#Get rid of overlaps
plt.show()
#chi-squared distribution
y = WomenShoe.colors.value_counts().head(10)
Colors = []
for i in range (10):
Colors.append([y.index[i], y[i]])
# filter the origanial dataframe and keep all the rows that has those top ten colors
dfc = []
for i in range (len(Colors)):
dfc.append(Colors[i][0])
color_df = WomenShoe[WomenShoe['colors'].isin(dfc)]
color_df.groupby('colors')['average_price'].mean().sort_values(ascending=False)
x = WomenShoe.brand_clean.value_counts()
brands = []
for i in range (1897):
if x[i] > 5:
brands.append([x.index[i], x[i]])
#filter the origanial dataframe and keep all the brands that have at least 5 rows in the dataset
df = []
for i in range (len(brands)):
df.append(brands[i][0])
WS_Brand = WomenShoe[WomenShoe['brand_clean'].isin(df)]
#Calculate the average price for each brand and only keep the top 20 brands
df_brand = WS_Brand.groupby('brand_clean')['average_price'].mean().sort_values(ascending=False).head(20)
Over5 = []
for i in range (len(df_brand)):
Over5.append([df_brand.index[i], df_brand[i]])
Over5_df = pd.DataFrame(Over5, columns=["brand", "average_price"])
plt.style.use('ggplot')
fig, ax = plt.subplots(figsize=(12, 10))
Over5_df = Over5_df.sort_values('average_price')
ax.barh(Over5_df['brand'], Over5_df['average_price'], color= "blue")
ax.set_ylabel("Brands")
ax.set_xlabel("Average Price")
ax.set_title("Top 20 brands with Count over 5")
plt.show()
brands = []
for i in range (1897):
if x[i] > 50:
brands.append([x.index[i], x[i]])
#filter the origanial dataframe and keep all the brands that have at least 50 rows in the dataset
df = []
for i in range (len(brands)):
df.append(brands[i][0])
WS_Brand = WomenShoe[WomenShoe['brand_clean'].isin(df)]
#Calculate the average price for each brand and only keep the top 20 brand
df_brand = WS_Brand.groupby('brand_clean')['average_price'].mean().sort_values(ascending=False).head(20)
Over5 = []
for i in range (len(df_brand)):
Over5.append([df_brand.index[i], df_brand[i]])
Over5_df = pd.DataFrame(Over5, columns=["brand", "average_price"])
#draw a bar chart for 20 brands with the highest average price
plt.style.use('ggplot')
my_cmap = plt.get_cmap("viridis")
fig, ax = plt.subplots(figsize=(12, 10))
Over5_df = Over5_df.sort_values('average_price')
ax.barh(Over5_df['brand'], Over5_df['average_price'], color= my_cmap.colors )
ax.set_ylabel("Brands")
ax.set_xlabel("Average Price")
ax.set_title("Top 20 brands with Count over 50")
plt.show()
plt.style.use('ggplot')
fig, ax = plt.subplots()
ax.hist(WomenShoe["average_price"], bins=50, range=(0,800), color = 'orange')
ax.set_title('Price Distribution Across Brands')
ax.set_xlabel("Price")
ax.set_ylabel("Frequency")
plt.show()
Ralph_lauren = WomenShoe[WomenShoe['brand_clean'] == "ralph lauren"]
Ugg = WomenShoe[WomenShoe['brand_clean'] == "ugg"]
Nike = WomenShoe[WomenShoe['brand_clean'] == "nike"]
Puma = WomenShoe[WomenShoe['brand_clean'] == "puma"]
Toms = WomenShoe[WomenShoe['brand_clean'] == "toms"]
Vans = WomenShoe[WomenShoe['brand_clean'] == "vans"]
plt.style.use('ggplot')
fig, ax = plt.subplots(3, 2, figsize=(15, 8))
ax[0, 0].hist(Ralph_lauren["average_price"], bins=50, range=(0,1500), color = 'cadetblue')
ax[0,0].set_title('Ralph Lauren')
ax[0,0].set_ylabel("Frequency")
ax[0, 1].hist(Ugg["average_price"], bins=50, range=(0,500), color = 'lightblue')
ax[0,1].set_title('Ugg')
ax[1, 0].hist(Nike["average_price"], bins=50, range=(0,500), color = 'skyblue')
ax[1,0].set_title('Nike')
ax[1,0].set_ylabel("Frequency")
ax[1, 1].hist(Puma["average_price"], bins=50, range=(0,500), color = 'steelblue')
ax[1,1].set_title('Puma')
ax[2, 0].hist(Toms["average_price"], bins=50, range=(0,500), color = 'dodgerblue')
ax[2,0].set_title('Toms')
ax[2,0].set_ylabel("Frequency")
ax[2,0].set_xlabel("Average Price")
ax[2, 1].hist(Vans["average_price"], bins=50, range=(0,500), color = 'deepskyblue')
ax[2,1].set_title('Vans')
ax[2,1].set_xlabel("Average Price")
plt.tight_layout()#Get rid of overlaps
plt.show()
Ralph_lauren['average_price'].plot(kind='density', color = 'cadetblue')
plt.title('Price Distribution for Ralph Lauren')
plt.xlabel('Price')
plt.show()
Ugg['average_price'].plot(kind='density', color = 'lightblue')
plt.title('Price Distribution for Ugg')
plt.xlabel('Price')
plt.show()
Nike['average_price'].plot(kind='density', color = 'skyblue')
plt.title('Price Distribution for Nike')
plt.xlabel('Price')
plt.show()
Puma['average_price'].plot(kind='density', color = 'steelblue')
plt.title('Price Distribution for Puma')
plt.xlabel('Price')
plt.show()
Toms['average_price'].plot(kind='density', color = 'dodgerblue')
plt.title('Price Distribution for Toms')
plt.xlabel('Price')
plt.show()
Vans['average_price'].plot(kind='density', color = 'deepskyblue')
plt.title('Price Distribution for Vans')
plt.xlabel('Price')
plt.show()
Overall, I explored the price distribution for shoe colors and shoe brands. The price distribution for shoe colors are more like a chi-squared distribution. The price distribution for shoe brands can be close to normal distribution.