#data cleaning script
import pandas as pd
= pd.read_csv('fareway_shenandoah_bacon.csv',index_col=False)
df
df= pd.read_csv('iowa_zip_lookup.csv',index_col=False)
zip_codes_list
zip_codes_list= zip_codes_list.drop('County',axis=1)
zip_codes_list = df.drop(df.columns[0], axis=1)
df =df.reset_index(drop=True)
df= df.drop(df.columns[5], axis=1)
df = df.drop(df.columns[4], axis=1)
df =df.rename(columns={"au-target": "Product Name", "au-target 2": "Original Price", "product-price": "Selling Price", "pc-amount": "Weight in lb"})
df'Weight in lb']=df['Weight in lb'].fillna('1 lb')
df[
df'Weight in lb']=df['Weight in lb'].replace('16 oz', '1 lb', regex=True)
df['Weight in lb']=df['Weight in lb'].replace(' Box', '', regex=True)
df[#df['Weight in lb']=df['Weight in lb'].replace(' lb', '', regex=True)
'Weight in lb']=df['Weight in lb'].replace('Approx ', '', regex=True)
df['Weight in lb']=df['Weight in lb'].replace(' LB Each', '', regex=True)
df['Selling Price']=df['Selling Price'].replace('/LB', '', regex=True)
df[def convert_to_pounds(weight):
if isinstance(weight, str):
if 'oz' in weight:
= float(weight.strip(' oz'))
ounces = ounces / 16
pounds return f"{pounds} lb"
elif 'lb' in weight:
return weight
return weight
'Weight in lb'] = df['Weight in lb'].apply(convert_to_pounds)
df['Zip Code'] = '51601'
df[
dfdef convert_to_price_per_pound(row):
= float(row['Selling Price'].strip('$'))
price_value = row['Weight in lb'].split(' ')[0]
weight_str = float(weight_str)
weight_value = price_value / weight_value
price_per_pound return pd.Series([price_value, weight_value, price_per_pound], index=['Weight in lb', 'Weight (lb)', 'Price per Pound ($)'])
'Price ($)', 'Weight (lb)', 'Price per Pound ($)']] = df.apply(convert_to_price_per_pound, axis=1)
df[[
dffrom datetime import datetime
# Create a sample DataFrame
# Generate the current date and time
= datetime.now().strftime('%Y%m%d_%H%M%S')
current_datetime
# Define the file name with the current date and time
= f'fareway_shenandoah_cleaned_{current_datetime}.csv'
file_name
# Output the DataFrame to the CSV file
=False) df.to_csv(file_name, index
Week Four Highlights
Monday- Collecting data for the project and working on finding data sources for Heirloom Tomatoes.
Tuesday- Collected in person data at Slater to analyze the housing conditions and amenities there in the first half of the day. Second half was remote where I worked on compiling and cleaning the data.
Wednesday- Created a script which cleaned scraped prices from Fareway Market website for locations Ames, Fort Dodge, Davenport, Iowa City, New Hampton, Clear Lake, Sioux City, Shenandoah in Iowa.
Thursday- Worked to get more data output, discussed presentation and blog wrap ups with team.