A look at railroad freight volumes, based on the weekly carload reports published by BNSF, CSX, Norfolk Southern, and Union Pacific.
import pandas as pd
import altair as alt
import numpy as np
import re
from joblib import Memory
from time import sleep
from datetime import date
from IPython.core.display import HTML
from tabula import read_pdf
from tika import parser
from urllib.request import urlopen, Request
from urllib.parse import urlencode
from urllib.error import URLError
from http.client import InvalidURL
from json import dumps, loads
# additional dependencies: unidecode, fake_useragent
#from googleapi.google import search as googleSearch
from urllib.error import HTTPError
from io import BytesIO
from os import environ
memory = Memory('data/', verbose=0)
uaString = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
if environ.get("GITHUB_WORKFLOW"):
raise RuntimeError("Requires manual updates")
currentYear = date.today().year
%%capture
%%time
@memory.cache
def parseCsxReport(urlAndWeek):
try:
url, weekAndYear = urlAndWeek
# '2013 Week 29'
year, week = weekAndYear.split(' Week ')
except ValueError:
try:
url, weekAndYear = urlAndWeek
            # e.g. '2014-WK20'
year, week = weekAndYear.split('-WK')
except ValueError:
print(urlAndWeek)
return None
    # PDF column layout: WEEK nn | QUARTER TO DATE | YEAR TO DATE
baseCols = ['Cargo', 'Week {0} {1}', 'Week {0} {2}', 'Week {0} pct-change',
'Week {0} {1} QTD', 'Week {0} {2} QTD', 'Week {0} QTD pct-change',
'Week {0} {1} YTD', 'Week {0} {2} YTD', 'Week {0} YTD pct-change']
try:
cols = [c.format(week.split(' ')[0], int(year), int(year)-1) for c in baseCols]
except ValueError:
print(urlAndWeek)
return None
try:
tmp = read_pdf(url, pages='all', pandas_options={ 'names': cols })
df = pd.concat(tmp) if isinstance(tmp, list) else tmp
return df.iloc[1:, :].set_index('Cargo').applymap(lambda v: float(re.sub('[^0-9.]', '', v)))
    except (AttributeError, ValueError, HTTPError):
        print(urlAndWeek)
        return None
#parseCsxReport(('http://csx.gcs-web.com/static-files/f2baf981-9658-41f8-9ec0-400e9fdbde11', '2013 Week 29')).head(30)
#parseCsxReport(('http://csx.gcs-web.com/static-files/2d4f737e-1314-46aa-9e8c-a29b40be2a2b', '2019 Week 49 AAR')).head(30)
def getWeekFrom(u):
    return re.findall(r"(?:Week-|WK|Week_|w|week)(\d\d?)", u)[0]
#getWeekFrom('https://s2.q4cdn.com/859568992/files/doc_downloads/volume_trends/2014/CSX_AAR-2014-WK20_csx.pdf')
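A couple of quick checks of the week-extraction pattern against the two filename shapes seen above (these URLs are illustrative, not real report links):
# illustrative filenames only; the pattern pulls the week number out of either naming scheme
assert getWeekFrom('https://example.com/2014/CSX_AAR-2014-WK20_csx.pdf') == '20'
assert getWeekFrom('https://example.com/2016/Week-7-2016.pdf') == '7'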
def getUrlsAndWeeks(res):
urls = []
for item in res['GetContentAssetListResult']:
u = item['FilePath']
if "2020" in u:
year = 2020
elif "2021" in u:
year = 2021
elif "2022" in u:
year = 2022
else:
year = u.split("/")[-2]
try:
week = getWeekFrom(u)
yearAndWeek = f"{year} Week {week}"
urls.append((u, yearAndWeek))
except IndexError:
continue
return urls
reportListUrl = "https://investors.csx.com/Services/ContentAssetService.svc/GetContentAssetList"
params = lambda y: {
"serviceDto": {
"ViewType": "2",
"ViewDate": "",
"RevisionNumber": "1",
"LanguageId": "1",
"Signature": "",
"ItemCount": -1,
"StartIndex": 0,
"TagList": [],
"IncludeTags": True
},
"assetType": "Volume Trends",
"excludeSelection": 1,
"year": y
}
headers = {
'User-Agent': uaString,
'Content-Type': 'application/json; charset=UTF-8'
}
urls = []
for y in range(2014, currentYear + 1):
req = Request(reportListUrl, data=dumps(params(y)).encode('ascii'), headers=headers)
data = urlopen(req)
res_dict = loads(data.read())
urls = urls + getUrlsAndWeeks(res_dict)
dfs = map(parseCsxReport, urls)
merged = pd.concat(dfs, axis=1, sort=False).T
#merged.head()
subset = merged.filter(axis='index', regex='[0-9]$').copy()
subset['dt'] = subset.index.map(lambda v: pd.to_datetime(v.replace(' AAR', '') + ' 6', format='Week %W %Y %w', errors='coerce'))
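The trailing ' 6' pins each year-week label to its Saturday; a minimal illustration of the %W (week of year) and %w (weekday, 6 = Saturday) directives:
# 'Week 29 2013' plus weekday 6 resolves to the Saturday of that week
pd.to_datetime('Week 29 2013 6', format='Week %W %Y %w')  # Timestamp('2013-07-27 00:00:00')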
HTML("<h4><i>(data current as of {})</i></h4>".format(date.today().strftime("%B %d, %Y")))
def doChart(metric, df, color='grey', size=2, resample='1M', carrier='CSX Freight', domain=[-100, 100]):
pmp = df[['dt', metric]].sort_values('dt').copy()
pmp2 = pmp.set_index('dt').resample(resample).mean().dropna().reset_index() if resample else pmp
brush = alt.selection(type='interval', encodings=['x'])
base = alt.Chart(pmp2).mark_line(color='purple').encode(
alt.X('dt:T', axis=alt.Axis(title='', format='%b %Y')),
alt.Y(f'{metric}:Q', axis=alt.Axis(title='Volume [carloads]')),
tooltip=[alt.Tooltip("dt:T", format="%b %Y"), alt.Tooltip(f"{metric}:Q", format=",.02f")]
).properties(
width=750,
height=450
)
    upper = base.mark_bar(size=size, color=color).transform_window(
        # lag of 12 resample periods: one year at the default '1M' (the '3M' BNSF charts therefore compare 3 years back)
        window=[alt.WindowFieldDef("lag", metric, param=12, **{ 'as': 'previous' })],
sort=[{"field": "dt"}],
).transform_calculate(
yoy=f"((datum['{metric}'] - datum.previous) / datum.previous) * 100"
).encode(
alt.X('dt:T', axis=alt.Axis(title='', format='%b %Y'), scale=alt.Scale(domain=brush), impute=alt.ImputeParams(value=None)),
alt.Y('yoy:Q', axis=alt.Axis(title='52-Week Growth [%]'), scale=alt.Scale(domain=domain)),
color=alt.condition("datum['yoy'] < 0",
alt.value('lightsalmon'),
alt.value(color)
),
tooltip=[alt.Tooltip('dt:T', format='%B %Y', title='Period'), alt.Tooltip('yoy:Q', format=',.02f')]
).properties(
title=f'{carrier} Reports: {metric}'
)
lower = base.properties(
height=100
).add_selection(brush)
return (upper & lower).properties(
background='white'
)
#return c.display()
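The growth series in doChart is ordinary percent change against the lagged window value; the same arithmetic by hand, with made-up numbers:
# one period vs. the value 12 resample periods (one year at '1M') earlier
current, previous = 110.0, 100.0
assert ((current - previous) / previous) * 100 == 10.0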
def doYoyChart(df, metric = 'MOTOR VEHICLES', carrier="BNSF", period=52, freq='W', subset=-260, size=2, color='royalblue'):
#yoy = df.set_index('dt').pct_change(period, freq=freq).apply(lambda v: v * 100).sort_index().reset_index().copy()
# .transform_calculate(
# val=f"datum['{metric}'] > 100 ? NaN : datum['{metric}']"
#)
brush = alt.selection(type='interval', encodings=['x'])
base = alt.Chart(df[subset:]).mark_line(color='purple').encode(
alt.X('dt:T', axis=alt.Axis(title='', format='%b %Y')),
alt.Y(f'{metric}:Q', axis=alt.Axis(title='Volume [carloads]')),
tooltip=[
alt.Tooltip("dt:T", format="%b %Y"),
alt.Tooltip(f"{metric}:Q")
]
).properties(
width=750,
height=450
)
upper = base.mark_bar(size=size, color=color).transform_window(
window=[alt.WindowFieldDef("lag", metric, param=period, **{ 'as': 'previous' })],
sort=[{"field": "dt"}],
).transform_calculate(
yoy=f"((datum['{metric}'] - datum.previous) / datum.previous) * 100"
).encode(
alt.X('dt:T', axis=alt.Axis(title='', format='%b %Y'), scale=alt.Scale(domain=brush), impute=alt.ImputeParams(value=None)),
alt.Y('yoy:Q', axis=alt.Axis(title='Year-over-year Growth [%]'), impute=alt.ImputeParams(method='value', keyvals=[100, 1000])),
color=alt.condition(f"datum['yoy'] < 0",
alt.value('lightsalmon'),
alt.value(color)
),
tooltip=[
alt.Tooltip('dt:T', format='%b %Y', title=''),
alt.Tooltip(f'yoy:Q', format=',.0f', title='Year-over-year Growth [%]')
]
).properties(
title=f"{carrier} trend for: {metric}"
)
lower = base.properties(
height=100
).add_selection(brush)
return (lower & upper).properties(
background='white'
)
metric = 'Total Traffic'
doChart(metric, subset, color='royalblue', size=4, domain=[-30, 40])
metric = 'Total Carloads'
doChart(metric, subset, color='royalblue', size=4, domain=[-40, 40])
metric = 'Total Intermodal'
doChart(metric, subset, color='royalblue', size=4, domain=[-40, 40])
doChart('Primary Metal Products', subset, color='darkslategray', size=4, domain=[-40, 80])
metric = 'Lumber & Wood Products'
doChart(metric, subset, color='darkolivegreen', size=4, domain=[-40, 40])
metric = 'Motor Vehicles and Parts'
doChart(metric, subset, color='darkslategray', size=4, domain=[-100, 100])
metric = 'Containers'
doChart(metric, subset, color='royalblue', size=4, domain=[-40, 40])
metric = 'Coal'
doChart(metric, subset, color='black', size=4, domain=[-60, 80])
metric = 'Chemicals'
doChart(metric, subset, color='darkseagreen', size=4, domain=[-30, 30])
metric = 'Food Products'
doChart(metric, subset, color='darkolivegreen', size=4, domain=[-40, 40])
metric = 'Grain'
doChart(metric, subset, color='darkseagreen', size=4, domain=[-40, 60])
metric = 'Grain Mill Products'
doChart(metric, subset, color='teal', size=4, domain=[-20, 20])
# https://www.up.com/investor/aar-stb_reports/
from subprocess import CalledProcessError
upUrls = []
for y in range(2005, 2015):
url = "'http://www.up.com/investors/reports/archive.cfm?Year={}'".format(y)
res = !curl -L $url
    pdfs = re.findall(r'href="([^.]+\.pdf)"', "".join(res))
upUrls = upUrls + pdfs
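The href scrape is a plain regex over the raw page text; a minimal check against an illustrative snippet (the path here is made up):
snippet = '<a href="/investors/reports/2014/05.pdf">Week 5</a>'  # made-up markup
re.findall(r'href="([^.]+\.pdf)"', snippet)  # ['/investors/reports/2014/05.pdf']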
@memory.cache
def parseUnionPacificArchive(u, retries=3):
"""
Union Pacific Parser
"""
if retries < 0:
return None
theUrl = "http://up.com" + u
try:
week = re.search(".*[0-9]{4}/(\d\d).pdf.*", u).group(1)
except AttributeError:
try:
            week = re.search(r".*[0-9]{4}/week(\d\d)_\d\d\.pdf.*", u).group(1)
except AttributeError:
print("Failed: " + u)
return None
#print(week)
try:
req = Request(theUrl, data=None, headers={ 'User-Agent': uaString })
data = urlopen(req)
df_up = read_pdf(BytesIO(data.read()), pages='all', multiple_tables=False)[0]
originalColumns = df_up.columns.to_list()
df_up.columns = [str(originalColumns[1]) + '-' + str(week)] + originalColumns[1:]
year = str(df_up.columns[1])
        # guard: header row parsed as a carload count (e.g. "5,123") instead of a year
        if year[:2] in ['5,', '6,', '7,', '8,']:
print(f"Invalid year for {u}")
print(originalColumns)
return None
if year == '2009':
# fetch 2008 data from the 2009 previous year column
tmp = pd.concat([df_up.set_index(df_up.columns[0])[df_up.columns[1]].rename(year + '-' + str(week)),
df_up.set_index(df_up.columns[0])[df_up.columns[2]].rename("2008" + '-' + str(week))], axis=1)
else:
tmp = df_up.set_index(df_up.columns[0])[df_up.columns[1]].rename(year + '-' + str(week))
return tmp
    except CalledProcessError:
        return None
    except (KeyError, IndexError, URLError):
        print("Failed: " + u)
        sleep(2 ** (3 - retries))
        # retry with the original relative path, not the already-prefixed URL
        return parseUnionPacificArchive(u, retries-1)
#parseUnionPacificArchive(upUrls[1]).head()
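The retry helpers back off exponentially via sleep(2 ** (3 - retries)); with the default retries=3 the waits between the four attempts are 1, 2, 4, and 8 seconds:
# backoff schedule: one sleep per failed attempt until retries goes negative
assert [2 ** (3 - r) for r in (3, 2, 1, 0)] == [1, 2, 4, 8]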
%%capture
#%%time
dfs_a = []
for u in upUrls:
if "changes" in u:
continue
dfs_a.append(parseUnionPacificArchive(u))
df_aone = pd.concat(dfs_a, axis=1).T
validCols = [c for c in df_aone.columns if c]
df_aone[validCols] = df_aone[validCols].applymap(lambda v: str(v).replace(",", "") if not pd.isnull(v) else v)\
.applymap(lambda v: pd.to_numeric(v, errors='coerce'))
cleanColumn = lambda s: re.sub("^\d+ ", "", str(s)).replace(", N.E.C", "").replace("Prod.", "Products")
df_aone.columns = [cleanColumn(c) for c in df_aone.columns]
#df_aone.columns
tab = df_aone.T.reset_index().melt(id_vars='index').groupby(['index', 'variable']).agg('mean')\
.reset_index().pivot(index='variable', columns='index')
tab = tab.droplevel(level=0, axis=1)
tab.index = tab.index.rename('dt')
#tab.head()
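A toy version of the melt/groupby/pivot round-trip above, with made-up carload counts, to show the reshape: cargo-by-week in, week-by-cargo (averaged over any duplicate columns) out:
toy = pd.DataFrame({'2013-01': [100.0, 50.0], '2013-02': [110.0, 55.0]},
                   index=['Coal', 'Grain'])  # rows: cargo, columns: year-week
(toy.reset_index().melt(id_vars='index')
    .groupby(['index', 'variable']).agg('mean')
    .reset_index().pivot(index='variable', columns='index')
    .droplevel(level=0, axis=1))  # rows: year-week, columns: cargo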
%%capture
#%%time
# curl -L 'https://www.up.com/up/investor/aar-stb_reports/2015_carloadings/index.htm'
@memory.cache
def getUrls2():
upUrls2 = []
for y in range(2015, currentYear):
url = "'https://www.up.com/up/investor/aar-stb_reports/{}_carloadings/index.htm'".format(y)
if y == 2021:
url = "'https://www.up.com/up/investor/aar-stb_reports/2021_Carloads/index.htm'"
res = !curl -L $url
        pdfs = re.findall(r'href="([^.]+\.pdf)"', "".join(res))
upUrls2 = upUrls2 + pdfs
return upUrls2
# For current year
new_url = "https://www.up.com/up/investor/aar-stb_reports/2022_carloads/index.htm"
new_res = !curl -L $new_url
new_pdfs = re.findall(r'href="([^.]+\.pdf)"', "".join(new_res))
upUrls2 = getUrls2() + new_pdfs
@memory.cache
def parseUnionPacificArchive2(url, retries=3):
"""
Union Pacific Parser 2
"""
if retries < 0:
return None
theUrl2 = "http://up.com" + url
try:
week = re.search(".*_[\d]{4}-(\d\d?).pdf.*", url).group(1)
except AttributeError:
try:
week = re.search(".*week_(\d\d?)\_[\d]{4}[^.]+.pdf.*", url).group(1)
except AttributeError:
try:
week = re.search(".*_[\d]{4}_(\d\d?)[^\.]*.pdf.*", url).group(1)
except AttributeError:
print("Failed week: " + url)
return None
#print(week)
try:
req = Request(theUrl2, data=None, headers={ 'User-Agent': uaString })
data = urlopen(req)
df_up = read_pdf(BytesIO(data.read()), pages='all', multiple_tables=False)[0]
df_up.columns = [str(df_up.columns[1]) + '-' + str(week)] + df_up.columns.to_list()[1:]
tmp = df_up.set_index(df_up.columns[0])[df_up.columns[1]].rename(str(df_up.columns[1]) + '-' + str(week))
return tmp
except IndexError:
print("Failed index: " + url)
sleep(2 ** (3 - retries))
return parseUnionPacificArchive2(url, retries-1)
except HTTPError:
print("Failed url: " + url)
sleep(2 ** (3 - retries))
return parseUnionPacificArchive2(url, retries-1)
except InvalidURL:
print("Failed url: " + url)
return None
# parseUnionPacificArchive2(upUrls2[2]).head()
dfs_atwo = []
for u in upUrls2:
if "changes" in u:
continue
dfs_atwo.append(parseUnionPacificArchive2(u))
df_atwo = pd.concat(dfs_atwo, axis=1).T# .head()
validCols = [c for c in df_atwo.columns if c]
df_atwo[validCols] = df_atwo[validCols].applymap(lambda v: str(v).replace(",", "") if not pd.isnull(v) else v)\
.applymap(lambda v: pd.to_numeric(v, errors='coerce'))
#df_atwo.columns
df_unionpac = pd.concat([tab, df_atwo])
df_unionpac['dt'] = df_unionpac.index.map(lambda v: pd.to_datetime(v+"-6", format="%Y-%W-%w", errors='coerce'))
#df_unionpac.head()
df_unionpac_yoy = df_unionpac.groupby('dt').agg('mean').resample('1W').nearest().pct_change(52, freq='W').apply(lambda v: v * 100).reset_index()
#df_unionpac_yoy.tail()
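pct_change(52, freq='W') divides each week by the value dated 52 weeks earlier; the same comparison on a made-up two-point weekly series:
s = pd.Series([100.0, 120.0],
              index=pd.to_datetime(['2020-01-05', '2021-01-03']))  # Sundays, 52 weeks apart
s.pct_change(52, freq='W') * 100  # 20.0 at 2021-01-03, NaN where no prior-year value exists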
def doUpChart(df, metric='Total Carloads', color='purple'):
    return alt.Chart(df.reset_index()[['dt', metric]]).mark_line(color=color).encode(
alt.X('dt:T', axis=alt.Axis(title='')),
alt.Y('{}:Q'.format(metric))
).properties(
title=f"Union Pacific Railroad: {metric} History",
width=750,
height=400
)
doUpChart(df_unionpac, 'Total Carloads')
def doYoyUpChart(df, metric, color='royalblue', domain=[-30, 30]):
brush = alt.selection(type='interval', encodings=['x'])
base = alt.Chart(df).mark_line(color='purple').encode(
alt.X('dt:T', axis=alt.Axis(title='', format='%b %Y'), impute=alt.ImputeParams(value=None)),
alt.Y(f'{metric}:Q', axis=alt.Axis(title='Volume [carloads]'), impute=alt.ImputeParams(value=None)),
tooltip=[alt.Tooltip("dt:T", format="%b %Y"), alt.Tooltip(f"{metric}:Q", format=",.02f")]
).properties(
width=750,
height=450
)
upper = base.mark_bar(size=1.5, color=color).transform_window(
window=[alt.WindowFieldDef("lag", metric, param=52, **{ 'as': 'previous' })],
sort=[{"field": "dt"}],
).transform_calculate(
yoy=f"((datum['{metric}'] - datum.previous) / datum.previous) * 100"
).encode(
alt.X('dt:T', axis=alt.Axis(title='', format='%b %Y'), scale=alt.Scale(domain=brush), impute=alt.ImputeParams(value=None)),
alt.Y('yoy:Q', axis=alt.Axis(title='Year-over-year Growth [%]'), scale=alt.Scale(domain=domain)),
color=alt.condition(f"datum['yoy'] < 0",
alt.value('lightsalmon'),
alt.value(color)
),
tooltip=[alt.Tooltip('dt:T', format='%b %Y'), alt.Tooltip(f'yoy:Q', format=',.0f')]
).properties(
title=f"Union Pacific Railroad: {metric} Traffic"
)
lower = base.properties(
height=100
).add_selection(brush)
return (upper & lower).properties(
background='white'
)
c = doYoyUpChart(df_unionpac, 'Total Carloads')
c.save('transportation-rail.png')
c.display()
metric = 'Chemicals'
#doUpChart(df_unionpac, metric)
doYoyUpChart(df_unionpac, metric, color='darkseagreen')
metric = 'Lumber & Wood Products'
#doUpChart(df_unionpac, metric)
doYoyUpChart(df_unionpac, metric, color='forestgreen', domain=[-60, 60])
metric = 'Iron & Steel Scrap'
#doUpChart(df_unionpac, metric)
doYoyUpChart(df_unionpac, metric, color='slategray', domain=[-80, 80])
metric = 'Metals & Products'
#doUpChart(df_unionpac, metric)
doYoyUpChart(df_unionpac, metric, color='slategray', domain=[-80, 120])
metric = 'Motor Vehicles & Equipment'
#doUpChart(df_unionpac, metric)
doYoyUpChart(df_unionpac, metric, color='black', domain=[-85, 125])
metric = 'Metallic Ores'
#doUpChart(df_unionpac, metric)
doYoyUpChart(df_unionpac, metric, color='darkslategray', domain=[-85, 170])
metric = 'Nonmetallic Minerals'
#doUpChart(df_unionpac, metric)
doYoyUpChart(df_unionpac, metric, color='gray', domain=[-65, 100])
nsBase = "https://www.nscorp.com/content/dam/nscorp/get-to-know-ns/investor-relations/performance-measures/weekly-performance-reports/{0}/investor-weekly-carloads-{1}-{0}.pdf"
nsDates = [d for d in pd.date_range(start='01/01/2015', end=str(date.today()), freq='1M')]
# the misspelled paths below ("nvestor", "setember") reproduce typos in the URLs NS actually published
oneOffs = {
(2019, 7): "http://www.nscorp.com/content/dam/nscorp/get-to-know-ns/investor-relations/performance-measures/weekly-performance-reports/2019/nvestor-weekly-carloads-july-2019.pdf",
(2019, 9): "http://www.nscorp.com/content/dam/nscorp/get-to-know-ns/investor-relations/performance-measures/weekly-performance-reports/2019/nvestor-weekly-carloads-september-2019.pdf",
(2021, 9): "http://www.nscorp.com/content/dam/nscorp/get-to-know-ns/investor-relations/performance-measures/weekly-performance-reports/2021/investor-weekly-carloads-setember-2021.pdf",
}
monthNames = {
1: 'january', 2: 'february', 3: 'march',
4: 'april', 5: 'may', 6: 'june',
7: 'july', 8: 'august', 9: 'september',
10: 'october', 11: 'november', 12: 'december'
}
nsLinks = [oneOffs.get((d.year, d.month), nsBase.format(d.year, monthNames.get(d.month))) for d in nsDates]
#nsLinks
makeNumeric = lambda v: pd.to_numeric(re.sub("[^.0-9-]", "", str(v)), errors='coerce')
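makeNumeric strips everything but digits, the decimal point, and the minus sign before coercing; for example:
assert makeNumeric('1,234') == 1234
assert makeNumeric('-5.7%') == -5.7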
def handle(df_ns, dt, url):
    # the comparison column is dated 52 weeks before the report date
    priorDt = str((pd.to_datetime(dt) - pd.DateOffset(weeks=52)).date())
#print(len(df_ns.columns))
try:
if len(df_ns.columns) < 10:
df_ns2 = df_ns.iloc[10:, :2].copy()
df_ns2.columns = ['Key', 'Values'] #+ df_ns2.columns.to_list()[3:]
df_ns2 = df_ns2.set_index('Key').dropna(how='all') #.iloc[1:]
#print(df_ns2['Loadings'])
df_ns2[[dt, priorDt]] = df_ns2['Values'].str.split(" ", expand=True).iloc[:, :2].applymap(makeNumeric)
else:
#print(df_ns.iloc[:, :3].head())
df_ns2 = df_ns.iloc[:, :3].copy()
df_ns2.columns = ['Key', dt, priorDt]
df_ns2 = df_ns2.set_index('Key').dropna(how='all') #.iloc[1:]
#print(df_ns2['Loadings'])
except Exception as e:
print(f"Failed {url}")
print(e)
return None
#print(df_ns2.iloc[:, -2:].dropna(how='all').head())
return df_ns2.iloc[:, -2:].dropna(how='all')
def handleOneOff(df, dt):
tmp = df.iloc[4:, 2:8]
try:
tmp.columns = ['Key', 'one', 'two', 'three', 'four', dt]
except ValueError:
return None
return tmp.set_index('Key').iloc[:, -1] if (tmp.shape[0] == 25) else None
def getDatesFrom(byteData, df_ns, url):
    try:
        raw = parser.from_buffer(byteData)
        if raw['content']:
            dts = re.findall(r".* To: ([\d]{2}-[\d]{2}-[\d]{4}).*", raw['content'])
        else:
            # fall back to the date embedded in each table's header row
            dts = [re.search(r".* To: ([\d]{2}-[\d]{2}-[\d]{4}).*", df.columns[1]).group(1) for df in df_ns]
    except (AttributeError, ValueError) as e:
        print(f"Failed on {url}")
        print(e)
        return None
    return dts
@memory.cache
def fetch(url):
req = Request(url, data=None, headers={ 'User-Agent': uaString })
data = urlopen(req)
byteData = BytesIO(data.read())
#print(f"Parsing {url}...")
return byteData, read_pdf(byteData, pages='all')
#@memory.cache
def parseNorfolkSouthern(url, retries=3, empty=""):
"""
NorfolkSouthern Parser
"""
if retries < 0:
return None
try:
#print(f"Parsing {url}...")
byteData, df_ns = fetch(url)
        byteData.seek(0)  # read_pdf in fetch() consumed the buffer; rewind before Tika reads it
        dts = getDatesFrom(byteData, df_ns, url)
# Handle the "one-offs" - really this is the majority of cases
if any(map(lambda e: len(e.columns) > 1 and "Norfolk" in e.columns[1], df_ns)):
return pd.concat([handleOneOff(f, t) for f, t in zip(df_ns, dts)], axis=1)
#for n, df in enumerate(df_ns):
# df.to_csv(f"{n}.csv", index=False)
except HTTPError as he:
print(f"Failed on {url}")
print(he)
#print(df_ns.columns[1], df_ns.columns)
return None
except Exception as e:
print(e)
sleep(2 ** (3 - retries))
        return parseNorfolkSouthern(url, retries-1)
#print(dts)
combined = [handle(f, t, url) for f, t in zip(df_ns, dts)]
return pd.concat(combined, axis=1) if any(c.shape[0] for c in combined if c is not None) else None
%%time
dfs_ns = [parseNorfolkSouthern(l, 2, "") for l in nsLinks]
df_ns_ts = pd.concat(dfs_ns, axis=1).T
# df_ns_merged.to_csv("norfolk_southern.csv", sep='|')
# df_ns_merged = pd.read_csv("norfolk_southern.csv", sep="|")
df_ns_ts = df_ns_ts.applymap(makeNumeric)
# http://www.nscorp.com/content/dam/nscorp/get-to-know-ns/investor-relations/performance-measures/weekly-performance-reports/2021/investor-weekly-carloads-december-2021.pdf
df_ns_ts['dt'] = df_ns_ts.index.map(pd.to_datetime)
doChart(df=df_ns_ts, metric='Motor Vehicles and Equipment', size=6, color='black', carrier='Norfolk Southern', domain=[-100, 100])
doChart(df=df_ns_ts, metric='Lumber and Wood Products', carrier='Norfolk Southern', size=5, color='forestgreen')
doChart(df=df_ns_ts, metric='Pulp, Paper and Allied Products', carrier='Norfolk Southern', size=5, color='forestgreen')
doChart(df=df_ns_ts, metric='Primary Forest Products', carrier='Norfolk Southern', size=5, color='forestgreen')
doChart(df=df_ns_ts, metric='Chemicals', carrier='Norfolk Southern', size=5, color='darkseagreen')
doChart(df=df_ns_ts, metric='Iron and Steel Scrap', carrier='Norfolk Southern', size=5, color='grey')
doChart(df=df_ns_ts, metric='Total Carloadings', carrier='Norfolk Southern', size=5)
doChart(df=df_ns_ts, metric='Total Intermodal', carrier='Norfolk Southern', size=5)
doChart(df=df_ns_ts, metric='Metals and Products', carrier='Norfolk Southern', size=5, color='grey')
doChart(df=df_ns_ts, metric='Metallic Ores', carrier='Norfolk Southern', size=5, color='black')
doChart(df=df_ns_ts, metric='Container', carrier='Norfolk Southern', size=5)
# https://bnsf.com/about-bnsf/financial-information/index.html#Weekly+Carload+Reports
base = "https://bnsf.com/about-bnsf/financial-information/weekly-carload-reports/pdf/{}.pdf"
dates = [str(d).replace("-", "").split()[0] for d in pd.date_range(start='05/16/2009', end=str(date.today()), freq='7D')]
errata = {
"20200404": "https://web.archive.org/web/20200621144255/https://www.bnsf.com/about-bnsf/financial-information/weekly-carload-reports/pdf/20200404.pdf"
}
links = [errata.get(d, base.format(d)) for d in dates]
%%time
cleanValue = lambda v: "-" + re.sub("[^.0-9]", "", str(v)) if "-" in str(v) else re.sub("[^.0-9]", "", str(v))
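cleanValue keeps the sign when the raw cell contains a minus anywhere (presumably because the BNSF PDFs can show negative comparisons) and strips commas; for example:
assert cleanValue('1,234') == '1234'
assert cleanValue('-1,234') == '-1234'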
@memory.cache
def parseBnsf(link, retries=3):
"""
BNSF Parser
"""
if retries < 0:
print("Failed " + link)
#print(e)
return None
location = link
try:
dt = location.split("/")[-1].split(".")[0]
req = Request(location, data=None, headers={ 'User-Agent': uaString })
data = urlopen(req)
df = read_pdf(BytesIO(data.read()), pages='all')[0]
df_t = df.set_index(df.columns[0]).iloc[:, 2]
df_t = df_t.dropna().apply(cleanValue).apply(pd.to_numeric)
df_t = df_t.T
df_t['dt'] = pd.to_datetime(dt)
df_t['source'] = df_t.index.map(lambda v: location)
df_to = df.set_index(df.columns[0]).iloc[:, 5]
df_to = df_to.dropna().apply(cleanValue).apply(pd.to_numeric)
df_to = df_to.T
df_to['dt'] = pd.to_datetime(dt) - pd.DateOffset(weeks=52)
return pd.concat([df_t, df_to], axis=1)
except Exception as e:
sleep(2 ** (3 - retries))
#print(e)
return parseBnsf(location, retries-1)
dfs = [parseBnsf(l) for l in set(links)]
df_bnsf = pd.concat(dfs, axis=1).T.reset_index() #.shape
df_bnsf.columns = ['Total Intermodal'] + df_bnsf.columns[1:].to_list()
use = [c for c in df_bnsf.columns if "source" not in c]
df_bnsf['Total Intermodal'] = df_bnsf['Total Intermodal'].apply(cleanValue).apply(makeNumeric)
# df_bnfs.to_csv("bnsf.csv", sep="|")
#df_bnsf = pd.read_csv('bnsf.csv', sep="|")
df_bnsf['dt'] = df_bnsf['dt'].apply(pd.to_datetime)
df_bnsf_ts = df_bnsf[use] #.groupby('dt').agg('mean').applymap(float).resample('1M').nearest().reset_index()
df_melted = df_bnsf[use].melt(id_vars='dt')
df_melted['value'] = df_melted['value'].apply(float)
df_melted = df_melted.groupby(['dt', 'variable']).agg('mean').reset_index() #.set_index('dt')
df_tab = df_melted.pivot_table(index='dt', columns='variable') #.resample('1M').mean()
df_bnsf_ts = df_tab.droplevel(0, axis=1).reset_index()
doChart(df=df_bnsf_ts, metric='Total Intermodal', color='slategray', size=7, resample='3M', carrier='BNSF')
doChart(df=df_bnsf_ts, metric='LUMBER/WOOD', color='forestgreen', size=7, resample='3M', carrier='BNSF')
doChart(df=df_bnsf_ts, metric='PULP/PAPER', color='forestgreen', size=7, resample='3M', carrier='BNSF')
doChart(df=df_bnsf_ts, metric='FOREST PRODUCTS', color='forestgreen', size=7, resample='3M', carrier='BNSF')
doChart(df=df_bnsf_ts, metric='CHEMICALS', color='darkseagreen', size=7, resample='3M', carrier='BNSF')
doChart(df=df_bnsf_ts, metric='FOOD', color='green', size=7, resample='3M', carrier='BNSF')
doChart(df=df_bnsf_ts, metric='MOTOR VEHICLES', color='darkblue', size=7, resample='3M', carrier='BNSF')
doChart(df=df_bnsf_ts, metric='IRON & STEEL SCRAP', color='slategrey', size=7, resample='3M', carrier='BNSF')
doChart(df=df_bnsf_ts, metric='METALS', color='black', size=7, resample='3M', carrier='BNSF')
doChart(df=df_bnsf_ts, metric='COAL', color='black', size=7, resample='3M', carrier='BNSF')
doChart(df=df_bnsf_ts, metric='GRAIN', color='goldenrod', size=7, resample='3M', carrier='BNSF')
doChart(df=df_bnsf_ts, metric='SAND/GRAVEL', color='grey', size=7, resample='3M', carrier='BNSF')
doChart(df=df_bnsf_ts, metric='PETROLEUM', color='green', size=7, resample='3M', carrier='BNSF')
doChart(df=df_bnsf_ts, metric='STONE/CLAY/GLASS', color='gray', size=7, resample='3M', carrier='BNSF')
doChart(df=df_bnsf_ts, metric='OTHER', color='gray', size=7, resample='3M', carrier='BNSF')