import pandas as pd
import altair as alt
import numpy as np
import re
# https://www.ers.usda.gov/data-products/dairy-data/
df_dairy = pd.read_excel("https://www.ers.usda.gov/webdocs/DataFiles/48685/quarterlymilkfactors_1_.xlsx", engine='openpyxl', skiprows=4)
df_dairy.columns = ["YearAndQuarter", "Year", "MilkCows", "productionLbsMilkPerCow", "productionMillionLbs",
"NA", "feedDollarPerCwt", "replacementPricePerHead"]
#df_dairy = df_dairy.iloc[:, :-5]
findYearLabels = lambda v: v if isinstance(v, int) or (isinstance(v, str) and re.match("^\d+$", v) is not None) else None
# Shift the year label over one column and forward fill it
df_dairy['Year'] = df_dairy['YearAndQuarter'].map(findYearLabels)
df_dairy['Year'] = df_dairy['Year'].ffill()
# Subset to only quarterly rows
df_dairy = df_dairy[~df_dairy['YearAndQuarter'].str.contains("^(?:\d|Avg)", regex=True, na=True)]
# Parse out the datetime value
df_dairy['dt'] = pd.to_datetime(df_dairy['YearAndQuarter'].map(lambda v: re.sub("[^A-z]", "", v.split("-")[-1])) + df_dairy['Year'].apply(int).apply(str),
format="%b%Y", errors='coerce')
# Drop all rows with invalid timestemp and the NA column
df_dairy = df_dairy[~pd.isna(df_dairy['dt'])] #.drop('NA', axis=1)
#df_dairy.tail(10)
alt.Chart(df_dairy).mark_line().encode(
alt.X('dt:T', axis=alt.Axis(title='')),
alt.Y('feedDollarPerCwt:Q', axis=alt.Axis(title='Value, 16% protein [$/cwt]'))
).properties(
title='US Dairy Cow Feed Value (per USDA)',
width=750,
height=450
)
def yoyCowChartFor(df, metric, label, units=''):
return alt.Chart(df).mark_bar().transform_window(
window=[alt.WindowFieldDef("lag", metric, param=12, **{ 'as': 'previous' })],
sort=[{"field": "dt"}],
).transform_calculate(
yoy=f"((datum['{metric}'] - datum.previous) / datum.previous) * 100"
).transform_filter(
"datum.yoy != -100"
).encode(
alt.X('dt:T', axis=alt.Axis(title='')),
alt.Y('yoy:Q', axis=alt.Axis(title=f'{label} [Year-over-year % change]')),
color=alt.condition(f"datum.yoy < 0",
alt.value('lightsalmon'),
alt.value('royalblue')
),
tooltip=[alt.Tooltip('dt:T', title='Date'), alt.Tooltip('yoy:Q', title='% Change', format=',.1f')]
).properties(
title=f'US {label} (per USDA)',
width=750,
height=450
)
yoyCowChartFor(df_dairy, metric='feedDollarPerCwt', label="Dairy Cow Feed Value", units='$/cwt')
alt.Chart(df_dairy).mark_line().transform_calculate(
cows='datum.MilkCows < 10 ? NaN : datum.MilkCows * 1000'
).encode(
alt.X('dt:T', axis=alt.Axis(title='')),
alt.Y('cows:Q', axis=alt.Axis(title='Cows'))
).properties(
title='US Dairy Cows (per USDA)',
width=750,
height=450
)
yoyCowChartFor(df_dairy, metric='MilkCows', label="Dairy Cow Population", units='cows')
c = alt.Chart(df_dairy).mark_line().encode(
alt.X('dt:T', axis=alt.Axis(title='')),
alt.Y('productionMillionLbs:Q', axis=alt.Axis(title='Milk [million-lbs]'))
).properties(
title='US Dairy Cow Production (per USDA)',
width=750,
height=450
)
c.save('dairy.png')
c.display()
yoyCowChartFor(df_dairy, metric='productionMillionLbs', label="Dairy Cow Milk Production", units='million-lbs')
df_dairy_products_raw = pd.read_excel("https://www.ers.usda.gov/webdocs/DataFiles/48685/CmDsProd.xlsx", engine='openpyxl',
sheet_name=None)
# Subset the workbook for only the monthly data
df_dairy_products_dict = { k: s for k, s in df_dairy_products_raw.items() if "monthly" in k }
butterCols = ['Year', 'Month', 'Beginning-commercial-stocks', 'Production',
'Imports', 'Total supply', "USDA-net-removals-barters-CCC-donations",
'Domestic-commercial-disappearance', 'Commercial-exports',
'Total-commercial-disappearance', 'Ending-commercial-stocks', 'NA']
try:
df_dairy_products_dict['Butter, monthly'].columns = butterCols
except ValueError:
df_dairy_products_dict['Butter, monthly'].columns = butterCols[:-1]
df_dairy_products_dict['Butter, monthly']['Year'] = df_dairy_products_dict['Butter, monthly']['Year'].ffill()
#df_dairy_products_dict['Butter, monthly'].iloc[2:].head(15)
#df_dairy_products_dict['Butter, monthly'].head(15)
alt.Chart(df_dairy_products_dict['Butter, monthly']).mark_line().transform_calculate(
dt="timeParse(datum.Year + ' ' + datum.Month, '%Y %b')"
).encode(
alt.X('dt:T', axis=alt.Axis(title='')),
alt.Y('Production:Q', axis=alt.Axis(title='Production [million-lbs]'))
).properties(
title='US Butter Production (per USDA)',
width=750,
height=450
)
skimMilkCols = ["Year", "Month", 'Supply-Nonfat-dry-milk-NDM',
'Supply-Dry-skim-milk-for-animal-use', 'Supply-Total', 'Production-NDM',
'Production-Skim milk powder', 'Production-Dry-skim-milk-for-animal-use',
'Production-Total', "Imports", "Total-supply", "USDA-net-removals-barters-CCC-donations",
'Commercial-Domestic-disappearance', 'Commercial-exports', 'Commercial-Total-disappearance',
'Ending-stocks-NDM', 'Ending-stocks-Dry-skim-milk-for-animal-use', 'Ending-stocks-Total']
df_dairy_products_dict['Dry skim milk, monthly'].columns = skimMilkCols
df_dairy_products_dict['Dry skim milk, monthly'] = df_dairy_products_dict['Dry skim milk, monthly'].iloc[3:].copy()
df_dairy_products_dict['Dry skim milk, monthly']['Year'] = df_dairy_products_dict['Dry skim milk, monthly']['Year'].ffill()
#df_dairy_products_dict['Dry skim milk, monthly'].head(10)
alt.Chart(df_dairy_products_dict['Dry skim milk, monthly']).mark_line().transform_calculate(
dt="timeParse(datum.Year + ' ' + datum.Month, '%Y %b')"
).encode(
alt.X('dt:T', axis=alt.Axis(title='')),
alt.Y('Production-Total:Q', axis=alt.Axis(title='Production [million-lbs]'))
).properties(
title='US Dry Skim Milk Production (per USDA)',
width=750,
height=450
)
amCheeseCols = ["Year", "Month", "Beginning-commercial-stocks", "Production", "Imports", "Total-supply",
"USDA-net-removals-barters-CCC-donations", "Domestic-commercial-disappearance",
"Commercial-exports", "commercial-disappearance", "Ending commercial stocks"]
df_dairy_products_dict['Amer cheese, monthly'].columns = amCheeseCols
df_dairy_products_dict['Amer cheese, monthly'] = df_dairy_products_dict['Amer cheese, monthly'].iloc[2:].copy()
df_dairy_products_dict['Amer cheese, monthly']['Year'] = df_dairy_products_dict['Amer cheese, monthly']['Year'].ffill()
#df_dairy_products_dict['Amer cheese, monthly'].head(10)
alt.Chart(df_dairy_products_dict['Amer cheese, monthly']).mark_line().transform_calculate(
dt="timeParse(datum.Year + ' ' + datum.Month, '%Y %b')"
).encode(
alt.X('dt:T', axis=alt.Axis(title='')),
alt.Y('Production:Q', axis=alt.Axis(title='Production [million-lbs]'))
).properties(
title='US American Cheese Production (per USDA)',
width=750,
height=450
)
otherCheeseCols = ["Year", "Month", "Beginning-commercial-stocks", "Production", "Imports", "Total-supply",
"USDA-net-removals-barters-CCC-donations", "Domestic-commercial-disappearance",
"Commercial-exports", "Total-commercial-disappearance", "Ending-commercial-stocks"]
df_dairy_products_dict['Other cheese, monthly'].columns = otherCheeseCols
df_dairy_products_dict['Other cheese, monthly'] = df_dairy_products_dict['Other cheese, monthly'].iloc[2:].copy()
df_dairy_products_dict['Other cheese, monthly']['Year'] = df_dairy_products_dict['Other cheese, monthly']['Year'].ffill()
#df_dairy_products_dict['Other cheese, monthly'].head(10)
alt.Chart(df_dairy_products_dict['Other cheese, monthly']).mark_line().transform_calculate(
dt="timeParse(datum.Year + ' ' + datum.Month, '%Y %b')"
).encode(
alt.X('dt:T', axis=alt.Axis(title='')),
alt.Y('Production:Q', axis=alt.Axis(title='Production [million-lbs]'))
).properties(
title='US Other Cheese Production (per USDA)',
width=750,
height=450
)
market = df_dairy_products_dict['Dry whey, monthly'].iloc[0].ffill()
category = df_dairy_products_dict['Dry whey, monthly'].iloc[1].ffill(limit=2)
usage = df_dairy_products_dict['Dry whey, monthly'].iloc[2]
noNans = lambda v: filter(lambda s: isinstance(s, str) and s != "nan", list(v))
onlyChars = lambda v: re.sub("[^A-z ]", "", str(v).strip())
wheyCols = ["-".join(map(onlyChars, noNans(c))).replace(" -", "-").replace(" ", "-") for c in zip(market, category, usage)]
df_dairy_products_dict['Dry whey, monthly'].columns = wheyCols
df_dairy_products_dict['Dry whey, monthly'] = df_dairy_products_dict['Dry whey, monthly'].iloc[3:310].copy()
df_dairy_products_dict['Dry whey, monthly']['Year'] = df_dairy_products_dict['Dry whey, monthly']['Year'].ffill()
#df_dairy_products_dict['Dry whey, monthly']
alt.Chart(df_dairy_products_dict['Dry whey, monthly']).mark_line().transform_calculate(
dt="timeParse(datum.Year + ' ' + datum.Month, '%Y %b')"
).encode(
alt.X('dt:T', axis=alt.Axis(title='')),
alt.Y('Supply-Production-Total:Q', axis=alt.Axis(title='Production [million-lbs]'))
).properties(
title='US Whey Production (per USDA)',
width=750,
height=450
)
df_dairy_products_dict['WPC, monthly'].columns = wheyCols
df_dairy_products_dict['WPC, monthly'] = df_dairy_products_dict['WPC, monthly'].iloc[3:310].copy()
df_dairy_products_dict['WPC, monthly']['Year'] = df_dairy_products_dict['WPC, monthly']['Year'].ffill()
#df_dairy_products_dict['WPC, monthly']
alt.Chart(df_dairy_products_dict['WPC, monthly']).mark_line().transform_calculate(
dt="timeParse(datum.Year + ' ' + datum.Month, '%Y %b')"
).encode(
alt.X('dt:T', axis=alt.Axis(title='')),
alt.Y('Supply-Production-Total:Q', axis=alt.Axis(title='Production [million-lbs]'))
).properties(
title='US Whey Protein Concentrate Production (per USDA)',
width=750,
height=450
)
#'Lactose, monthly'
lactoseCols = ["Year", "Month", "Beginning manufacturers' stocks", "Production", "Imports", "Total-supply",
"Domestic-commercial-disappearance", "Commercial-exports", "Total-commercial-disappearance",
"Ending-manufacturers-stocks", "NA1", "NA2", "NA3"]
try:
df_dairy_products_dict['Lactose, monthly'].columns = lactoseCols
except ValueError:
df_dairy_products_dict['Lactose, monthly'].columns = lactoseCols[:-3]
df_dairy_products_dict['Lactose, monthly'] = df_dairy_products_dict['Lactose, monthly'].iloc[2:309].copy()
df_dairy_products_dict['Lactose, monthly'].loc[:, 'Year'] = df_dairy_products_dict['Lactose, monthly']['Year'].ffill()
#df_dairy_products_dict['Lactose, monthly']
alt.Chart(df_dairy_products_dict['Lactose, monthly']).mark_line().transform_calculate(
dt="timeParse(datum.Year + ' ' + datum.Month, '%Y %b')"
).encode(
alt.X('dt:T', axis=alt.Axis(title='')),
alt.Y('Production:Q', axis=alt.Axis(title='Production [million-lbs]'))
).properties(
title='US Lactose Production (per USDA)',
width=750,
height=450
)
def yoyChartFor(df, sheet, metric, label='Lactose Production'):
return alt.Chart(df[sheet]).mark_bar(size=2).transform_calculate(
dt="timeParse(datum.Year + ' ' + datum.Month, '%Y %b')",
).transform_window(
window=[alt.WindowFieldDef("lag", metric, param=12, **{ 'as': 'previous' })],
sort=[{"field": "dt"}],
).transform_calculate(
yoy=f"((datum['{metric}'] - datum.previous) / datum.previous) * 100"
).encode(
alt.X('dt:T', axis=alt.Axis(title='')),
alt.Y('yoy:Q', axis=alt.Axis(title=f'{label} [Year-over-year % change]')),
color=alt.condition(f"datum.yoy < 0",
alt.value('lightsalmon'),
alt.value('royalblue')
),
tooltip=[alt.Tooltip('yoy:Q', title='% Change', format=',.1f')]
).properties(
title=f'US {label} (per USDA)',
width=750,
height=450
)
yoyChartFor(df_dairy_products_dict, 'Lactose, monthly', 'Production', label='Lactose Production')
yoyChartFor(df_dairy_products_dict, 'WPC, monthly', 'Supply-Production-Total', label='WPC Production')
yoyChartFor(df_dairy_products_dict, 'Dry whey, monthly', 'Supply-Production-Total', label='Dry whey Production')
yoyChartFor(df_dairy_products_dict, 'Other cheese, monthly', 'Production', label='Other cheese Production')
yoyChartFor(df_dairy_products_dict, 'Amer cheese, monthly', 'Production', label='American cheese Production')
yoyChartFor(df_dairy_products_dict, 'Dry skim milk, monthly', 'Production-Total', label='Dry Skim Milk Production')
yoyChartFor(df_dairy_products_dict, 'Butter, monthly', 'Production', label='Butter Production')