US Macroeconomic Data Explorer | US Railroad Freight Traffic

US Railroad Freight Traffic


A look at railroad freight volumes, based on the weekly carload reports published by BNSF, CSX, Norfolk Southern, and Union Pacific.

In [1]:
import pandas as pd
import altair as alt
import numpy as np
import re
from joblib import Memory
from time import sleep
from datetime import date
from IPython.core.display import HTML

from tabula import read_pdf
from tika import parser

from urllib.request import urlopen, Request
from urllib.parse import urlencode
from urllib.error import URLError
from http.client import InvalidURL
from json import dumps, loads

# need {unidecode, fake_useragent}

#from googleapi.google import search as googleSearch
from urllib.error import HTTPError
from io import BytesIO
from os import environ

memory = Memory('data/', verbose=0)

uaString = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'

if environ.get("GITHUB_WORKFLOW"):
    raise RuntimeError("Requires manual updates")
    
currentYear = date.today().year
In [2]:
%%capture
%%time

@memory.cache
def parseCsxReport(urlAndWeek):
    try:
        url, weekAndYear = urlAndWeek
    
        # '2013 Week 29'
        year, week = weekAndYear.split(' Week ')
        
    except ValueError:
        try:
            url, weekAndYear = urlAndWeek
    
            # '2014-WK20'
            year, week = weekAndYear.split('-WK')
        except ValueError:
            print(urlAndWeek)
            return None
    
    #            WEEK   28                QUARTER TO DATE           YEAR TO DATE
    baseCols = ['Cargo', 'Week {0} {1}', 'Week {0} {2}', 'Week {0} pct-change',
                'Week {0} {1} QTD', 'Week {0} {2} QTD', 'Week {0} QTD pct-change',
                'Week {0} {1} YTD', 'Week {0} {2} YTD', 'Week {0} YTD pct-change']

    try:
        cols = [c.format(week.split(' ')[0], int(year), int(year)-1) for c in baseCols]
    except ValueError:
        print(urlAndWeek)
        return None
    
    try:
        tmp = read_pdf(url, pages='all', pandas_options={ 'names': cols })
        df = pd.concat(tmp) if isinstance(tmp, list) else tmp
        return df.iloc[1:, :].set_index('Cargo').applymap(lambda v: float(re.sub('[^0-9.]', '', v)))
    except AttributeError:
        print(urlAndWeek)
        return None
    except ValueError:
        print(urlAndWeek)
        return None
    except HTTPError:
        print(urlAndWeek)
        return None        
        
#parseCsxReport(('http://csx.gcs-web.com/static-files/f2baf981-9658-41f8-9ec0-400e9fdbde11', '2013 Week 29')).head(30)

#parseCsxReport(('http://csx.gcs-web.com/static-files/2d4f737e-1314-46aa-9e8c-a29b40be2a2b', '2019 Week 49 AAR')).head(30)

def getWeekFrom(u):
    return re.findall("(?:Week-|WK|Week_|w|week)(\d\d?)", u)[0]

#getWeekFrom('https://s2.q4cdn.com/859568992/files/doc_downloads/volume_trends/2014/CSX_AAR-2014-WK20_csx.pdf')

def getUrlsAndWeeks(res):
    urls = []
    for item in res['GetContentAssetListResult']:
        u = item['FilePath']

        if "2020" in u:
            year = 2020
        elif "2021" in u:
            year = 2021
        elif "2022" in u:
            year = 2022
        else:
            year = u.split("/")[-2]
        try:
            week = getWeekFrom(u)
            yearAndWeek = f"{year} Week {week}"
            urls.append((u, yearAndWeek))
        except IndexError:
            continue
            
    return urls

reportListUrl = "https://investors.csx.com/Services/ContentAssetService.svc/GetContentAssetList"

params = lambda y: {
    "serviceDto": {
        "ViewType": "2",
        "ViewDate": "",
        "RevisionNumber": "1",
        "LanguageId": "1",
        "Signature": "",
        "ItemCount": -1,
        "StartIndex": 0,
        "TagList": [],
        "IncludeTags": True
    },
    "assetType": "Volume Trends",
    "excludeSelection": 1,
    "year": y
}

headers = {
    'User-Agent': uaString,
    'Content-Type': 'application/json; charset=UTF-8'
}

urls = []
for y in range(2014, currentYear + 1):
    
    req = Request(reportListUrl, data=dumps(params(y)).encode('ascii'), headers=headers)

    data = urlopen(req)
    
    res_dict = loads(data.read())
    
    urls = urls + getUrlsAndWeeks(res_dict)

dfs = map(parseCsxReport, urls)

merged = pd.concat(dfs, axis=1, sort=False).T

#merged.head()

subset = merged.filter(axis='index', regex='[0-9]$').copy()

subset['dt'] = subset.index.map(lambda v: pd.to_datetime(v.replace(' AAR', '') + ' 6', format='Week %W %Y %w', errors='coerce'))
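
As a side note, the 'Week %W %Y %w' format string is what pins each week-numbered label to a calendar date; a minimal sketch (not part of the pipeline, using a made-up label) of that conversion:

import pandas as pd

# Appending ' 6' selects the Saturday (%w == 6) of the given %W week number,
# so a label like 'Week 29 2013' resolves to a concrete date.
label = "Week 29 2013"   # hypothetical index label
print(pd.to_datetime(label + " 6", format="Week %W %Y %w"))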
In [3]:
HTML("<h4><i>(data current as of {})</i></h4>".format(date.today().strftime("%B %d, %Y")))
Out[3]:

(data current as of April 27, 2022)

CSX Freight Reports

In [4]:
def doChart(metric, df, color='grey', size=2, resample='1M', carrier='CSX Freight', domain=[-100, 100]):
    pmp = df[['dt', metric]].sort_values('dt').copy()
    
    pmp2 = pmp.set_index('dt').resample(resample).mean().dropna().reset_index() if resample else pmp

    brush = alt.selection(type='interval', encodings=['x'])
    
    base = alt.Chart(pmp2).mark_line(color='purple').encode(
        alt.X('dt:T', axis=alt.Axis(title='', format='%b %Y')),
        alt.Y(f'{metric}:Q', axis=alt.Axis(title='Volume [carloads]')),
        tooltip=[alt.Tooltip("dt:T", format="%b %Y"), alt.Tooltip(f"{metric}:Q", format=",.02f")]
    ).properties(
        width=750,
        height=450
    )
    
    upper = base.mark_bar(size=size, color=color).transform_window(
        window=[alt.WindowFieldDef("lag", metric, param=12, **{ 'as': 'previous' })],
        sort=[{"field": "dt"}],
    ).transform_calculate(
        yoy=f"((datum['{metric}'] - datum.previous) / datum.previous) * 100"
    ).encode(
        alt.X('dt:T', axis=alt.Axis(title='', format='%b %Y'), scale=alt.Scale(domain=brush), impute=alt.ImputeParams(value=None)),
        alt.Y('yoy:Q', axis=alt.Axis(title='52-Week Growth [%]'), scale=alt.Scale(domain=domain)),
        color=alt.condition("datum['yoy'] < 0",
            alt.value('lightsalmon'),
            alt.value(color)
        ),
        tooltip=[alt.Tooltip('dt:T', format='%B %Y', title='Period'), alt.Tooltip('yoy:Q', format=',.02f')]
    ).properties(
        title=f'{carrier} Reports: {metric}'
    )
    
    lower = base.properties(
        height=100
    ).add_selection(brush)

    return (upper & lower).properties(
        background='white'
    )
    
    #return c.display()
    
def doYoyChart(df, metric = 'MOTOR VEHICLES', carrier="BNSF", period=52, freq='W', subset=-260, size=2, color='royalblue'):
    #yoy = df.set_index('dt').pct_change(period, freq=freq).apply(lambda v: v * 100).sort_index().reset_index().copy()
    
    # .transform_calculate(
    #    val=f"datum['{metric}'] > 100 ? NaN : datum['{metric}']"
    #)
    brush = alt.selection(type='interval', encodings=['x'])
    
    base = alt.Chart(df[subset:]).mark_line(color='purple').encode(
        alt.X('dt:T', axis=alt.Axis(title='', format='%b %Y')),
        alt.Y(f'{metric}:Q', axis=alt.Axis(title='Volume [carloads]')),
        tooltip=[
            alt.Tooltip("dt:T", format="%b %Y"),
            alt.Tooltip(f"{metric}:Q")
        ]
    ).properties(
        width=750,
        height=450
    )
    
    upper = base.mark_bar(size=size, color=color).transform_window(
        window=[alt.WindowFieldDef("lag", metric, param=period, **{ 'as': 'previous' })],
        sort=[{"field": "dt"}],
    ).transform_calculate(
        yoy=f"((datum['{metric}'] - datum.previous) / datum.previous) * 100"
    ).encode(
        alt.X('dt:T', axis=alt.Axis(title='', format='%b %Y'), scale=alt.Scale(domain=brush), impute=alt.ImputeParams(value=None)),
        alt.Y('yoy:Q', axis=alt.Axis(title='Year-over-year Growth [%]'), impute=alt.ImputeParams(method='value', keyvals=[100, 1000])),
        color=alt.condition(f"datum['yoy'] < 0",
            alt.value('lightsalmon'),
            alt.value(color)
        ),
        tooltip=[
            alt.Tooltip('dt:T', format='%b %Y', title=''),
            alt.Tooltip(f'yoy:Q', format=',.0f', title='Year-over-year Growth [%]')
        ]
    ).properties(
        title=f"{carrier} trend for: {metric}"
    )
    
    lower = base.properties(
        height=100
    ).add_selection(brush)

    return (lower & upper).properties(
        background='white'
    )
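
The year-over-year bars in these charts come from the Altair window lag transform above; for reference, a hedged pandas sketch of the same arithmetic (the 'dt' and metric column names are placeholders):

import pandas as pd

def yoy_percent(df, metric, lag=52):
    """Percent change versus the value `lag` rows earlier, after sorting by date."""
    ordered = df.sort_values("dt")[metric]
    previous = ordered.shift(lag)          # same role as the window 'lag' field
    return (ordered - previous) / previous * 100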
In [5]:
metric = 'Total Traffic'
doChart(metric, subset, color='royalblue', size=4, domain=[-30, 40])
Out[5]:
In [6]:
metric = 'Total Carloads'
doChart(metric, subset, color='royalblue', size=4, domain=[-40, 40])
Out[6]:
In [7]:
metric = 'Total Intermodal'
doChart(metric, subset, color='royalblue', size=4, domain=[-40, 40])
Out[7]:
In [8]:
doChart('Primary Metal Products', subset, color='darkslategray', size=4, domain=[-40, 80])
Out[8]:
In [9]:
metric = 'Lumber & Wood Products'
doChart(metric, subset, color='darkolivegreen', size=4, domain=[-40, 40])
Out[9]:
In [10]:
metric = 'Motor Vehicles and Parts'
doChart(metric, subset, color='darkslategray', size=4, domain=[-100, 100])
Out[10]:
In [11]:
metric = 'Containers'
doChart(metric, subset, color='royalblue', size=4, domain=[-40, 40])
Out[11]:
In [12]:
metric = 'Coal'
doChart(metric, subset, color='black', size=4, domain=[-60, 80])
Out[12]:
In [13]:
metric = 'Chemicals'
doChart(metric, subset, color='darkseagreen', size=4, domain=[-30, 30])
Out[13]:
In [14]:
metric = 'Food Products'
doChart(metric, subset, color='darkolivegreen', size=4, domain=[-40, 40])
Out[14]:
In [15]:
metric = 'Grain'
doChart(metric, subset, color='darkseagreen', size=4, domain=[-40, 60])
Out[15]:
In [16]:
metric = 'Grain Mill Products'
doChart(metric, subset, color='teal', size=4, domain=[-20, 20])
Out[16]:

Union Pacific Freight Reports

In [17]:
# https://www.up.com/investor/aar-stb_reports/

from subprocess import CalledProcessError

upUrls = []
for y in range(2005, 2015):
    url = "'http://www.up.com/investors/reports/archive.cfm?Year={}'".format(y)
    res = !curl -L $url
    
    pdfs = re.findall('href="([^.]+.pdf)"', "".join(res))
    
    upUrls = upUrls + pdfs

@memory.cache
def parseUnionPacificArchive(u, retries=3):
    """
    Union Pacific Parser
    """
    if retries < 0:
        return None
    
    theUrl = "http://up.com" + u

    try:
        week = re.search(".*[0-9]{4}/(\d\d).pdf.*", u).group(1)
    except AttributeError:
        try:
            week = re.search(".*[0-9]{4}/week(\d\d)\_\d\d.pdf.*", u).group(1)
        except AttributeError:
            print("Failed: " + u)
            return None

    #print(week)

    try:
        req = Request(theUrl, data=None, headers={ 'User-Agent': uaString })

        data = urlopen(req)
        
        df_up = read_pdf(BytesIO(data.read()), pages='all', multiple_tables=False)[0]
        originalColumns = df_up.columns.to_list()
        df_up.columns = [str(originalColumns[1]) + '-' + str(week)] + originalColumns[1:]
        
        year = str(df_up.columns[1])
        
        if year[:2] in ['5,', '6,', '7,', '8,']:
            print(f"Invalid year for {u}")
            print(originalColumns)
            return None

        if year == '2009':
            # fetch 2008 data from the 2009 previous year column
            tmp = pd.concat([df_up.set_index(df_up.columns[0])[df_up.columns[1]].rename(year + '-' + str(week)),
                             df_up.set_index(df_up.columns[0])[df_up.columns[2]].rename("2008" + '-' + str(week))], axis=1)
        else:
            tmp = df_up.set_index(df_up.columns[0])[df_up.columns[1]].rename(year + '-' + str(week))

        return tmp
    except CalledProcessError:
        return None
    except KeyError:
        print("Failed: " + u)
        sleep(2 ** (3 - retries))
        return parseUnionPacificArchive(u, retries-1)
    except IndexError:
        print("Failed: " + u)
        sleep(2 ** (3 - retries))
        return parseUnionPacificArchive(u, retries-1)
    except URLError:
        print("Failed: " + u)
        sleep(2 ** (3 - retries))
        return parseUnionPacificArchive(u, retries-1)
        
#parseUnionPacificArchive(upUrls[1]).head()
In [ ]:
%%capture
#%%time

dfs_a = []
for u in upUrls:
    if "changes" in u:
        continue
    
    dfs_a.append(parseUnionPacificArchive(u))

df_aone = pd.concat(dfs_a, axis=1).T


validCols = [c for c in df_aone.columns if c]

df_aone[validCols] = df_aone[validCols].applymap(lambda v: str(v).replace(",", "") if not pd.isnull(v) else v)\
                        .applymap(lambda v: pd.to_numeric(v, errors='coerce'))

cleanColumn = lambda s: re.sub("^\d+ ", "", str(s)).replace(", N.E.C", "").replace("Prod.", "Products")

df_aone.columns = [cleanColumn(c) for c in df_aone.columns]

#df_aone.columns

tab = df_aone.T.reset_index().melt(id_vars='index').groupby(['index', 'variable']).agg('mean')\
        .reset_index().pivot(index='variable', columns='index')

tab = tab.droplevel(level=0, axis=1)
tab.index = tab.index.rename('dt')

#tab.head()
In [ ]:
%%capture
#%%time

# curl -L 'https://www.up.com/up/investor/aar-stb_reports/2015_carloadings/index.htm'

@memory.cache
def getUrls2():
    upUrls2 = []
    for y in range(2015, currentYear):
        url = "'https://www.up.com/up/investor/aar-stb_reports/{}_carloadings/index.htm'".format(y)
        if y == 2021:
            url = "'https://www.up.com/up/investor/aar-stb_reports/2021_Carloads/index.htm'"

        res = !curl -L $url

        pdfs = re.findall('href="([^.]+.pdf)"', "".join(res))

        upUrls2 = upUrls2 + pdfs
    return upUrls2
        
# For current year
new_url = "https://www.up.com/up/investor/aar-stb_reports/2022_carloads/index.htm"
new_res = !curl -L $new_url
new_pdfs = re.findall('href="([^.]+.pdf)"', "".join(new_res))
upUrls2 = getUrls2() + new_pdfs

@memory.cache
def parseUnionPacificArchive2(url, retries=3):
    """
    Union Pacific Parser 2
    """
    if retries < 0:
        return None
    
    theUrl2 = "http://up.com" + url

    try:
        week = re.search(".*_[\d]{4}-(\d\d?).pdf.*", url).group(1)
    except AttributeError:
        try:
            week = re.search(".*week_(\d\d?)\_[\d]{4}[^.]+.pdf.*", url).group(1)
        except AttributeError:
            try:
                week = re.search(".*_[\d]{4}_(\d\d?)[^\.]*.pdf.*", url).group(1)
            except AttributeError:
                print("Failed week: " + url)
                return None

    #print(week)

    try:
        req = Request(theUrl2, data=None, headers={ 'User-Agent': uaString })

        data = urlopen(req)
        
        df_up = read_pdf(BytesIO(data.read()), pages='all', multiple_tables=False)[0]
        df_up.columns = [str(df_up.columns[1]) + '-' + str(week)] + df_up.columns.to_list()[1:]

        tmp = df_up.set_index(df_up.columns[0])[df_up.columns[1]].rename(str(df_up.columns[1]) + '-' + str(week))

        return tmp
    except IndexError:
        print("Failed index: " + url)
        sleep(2 ** (3 - retries))
        return parseUnionPacificArchive2(url, retries-1)
    except HTTPError:
        print("Failed url: " + url)
        sleep(2 ** (3 - retries))
        return parseUnionPacificArchive2(url, retries-1)
    except InvalidURL:
        print("Failed url: " + url)
        return None
    
# parseUnionPacificArchive2(upUrls2[2]).head()

dfs_atwo = []
for u in upUrls2:
    if "changes" in u:
        continue
    
    dfs_atwo.append(parseUnionPacificArchive2(u))

df_atwo = pd.concat(dfs_atwo, axis=1).T# .head()

validCols = [c for c in df_atwo.columns if c]

df_atwo[validCols] = df_atwo[validCols].applymap(lambda v: str(v).replace(",", "") if not pd.isnull(v) else v)\
                        .applymap(lambda v: pd.to_numeric(v, errors='coerce'))

#df_atwo.columns
In [20]:
df_unionpac = pd.concat([tab, df_atwo])

df_unionpac['dt'] = df_unionpac.index.map(lambda v: pd.to_datetime(v+"-6", format="%Y-%W-%w", errors='coerce'))

#df_unionpac.head()

df_unionpac_yoy = df_unionpac.groupby('dt').agg('mean').resample('1W').nearest().pct_change(52, freq='W').apply(lambda v: v * 100).reset_index()

#df_unionpac_yoy.tail()
In [21]:
def doUpChart(df, metric='Total Carloads', color='purple'):
    return alt.Chart(df.reset_index()[['dt', metric]]).mark_line(color=color).encode(
        alt.X('dt:T', axis=alt.Axis(title='')),
        alt.Y('{}:Q'.format(metric))
    ).properties(
        title=f"Union Pacific Railroad: {metric} History",
        width=750,
        height=400
    )
    
doUpChart(df_unionpac, 'Total Carloads')
Out[21]:
In [22]:
def doYoyUpChart(df, metric, color='royalblue', domain=[-30, 30]):

    brush = alt.selection(type='interval', encodings=['x'])
    
    base = alt.Chart(df).mark_line(color='purple').encode(
        alt.X('dt:T', axis=alt.Axis(title='', format='%b %Y'), impute=alt.ImputeParams(value=None)),
        alt.Y(f'{metric}:Q', axis=alt.Axis(title='Volume [carloads]'), impute=alt.ImputeParams(value=None)),
        tooltip=[alt.Tooltip("dt:T", format="%b %Y"), alt.Tooltip(f"{metric}:Q", format=",.02f")]
    ).properties(
        width=750,
        height=450
    )
    
    upper = base.mark_bar(size=1.5, color=color).transform_window(
        window=[alt.WindowFieldDef("lag", metric, param=52, **{ 'as': 'previous' })],
        sort=[{"field": "dt"}],
    ).transform_calculate(
        yoy=f"((datum['{metric}'] - datum.previous) / datum.previous) * 100"
    ).encode(
        alt.X('dt:T', axis=alt.Axis(title='', format='%b %Y'), scale=alt.Scale(domain=brush), impute=alt.ImputeParams(value=None)),
        alt.Y('yoy:Q', axis=alt.Axis(title='Year-over-year Growth [%]'), scale=alt.Scale(domain=domain)),
        color=alt.condition(f"datum['yoy'] < 0",
            alt.value('lightsalmon'),
            alt.value(color)
        ),
        tooltip=[alt.Tooltip('dt:T', format='%b %Y'), alt.Tooltip(f'yoy:Q', format=',.0f')]
    ).properties(
        title=f"Union Pacific Railroad: {metric} Traffic"
    )
    
    lower = base.properties(
        height=100
    ).add_selection(brush)

    return (upper & lower).properties(
        background='white'
    )

c = doYoyUpChart(df_unionpac, 'Total Carloads')
c.save('transportation-rail.png')

c.display()
In [23]:
metric = 'Chemicals'
#doUpChart(df_unionpac, metric)

doYoyUpChart(df_unionpac, metric, color='darkseagreen')
Out[23]:
In [24]:
metric = 'Lumber & Wood Products'
#doUpChart(df_unionpac, metric)

doYoyUpChart(df_unionpac, metric, color='forestgreen', domain=[-60, 60])
Out[24]:
In [25]:
metric = 'Iron & Steel Scrap'
#doUpChart(df_unionpac, metric)

doYoyUpChart(df_unionpac, metric, color='slategray', domain=[-80, 80])
Out[25]:
In [26]:
metric = 'Metals & Products'
#doUpChart(df_unionpac, metric)

doYoyUpChart(df_unionpac, metric, color='slategray', domain=[-80, 120])
Out[26]:
In [27]:
metric = 'Motor Vehicles & Equipment'
#doUpChart(df_unionpac, metric)

doYoyUpChart(df_unionpac, metric, color='black', domain=[-85, 125])
Out[27]:
In [28]:
metric = 'Metallic Ores'
#doUpChart(df_unionpac, metric)

doYoyUpChart(df_unionpac, metric, color='darkslategray', domain=[-85, 170])
Out[28]:
In [29]:
metric = 'Nonmetallic Minerals'
#doUpChart(df_unionpac, metric)

doYoyUpChart(df_unionpac, metric, color='gray', domain=[-65, 100])
Out[29]:

Norfolk Southern Freight Reports

In [30]:
nsBase = "https://www.nscorp.com/content/dam/nscorp/get-to-know-ns/investor-relations/performance-measures/weekly-performance-reports/{0}/investor-weekly-carloads-{1}-{0}.pdf"

nsDates = [d for d in pd.date_range(start='01/01/2015', end=str(date.today()), freq='1M')]

oneOffs = {
    (2019, 7): "http://www.nscorp.com/content/dam/nscorp/get-to-know-ns/investor-relations/performance-measures/weekly-performance-reports/2019/nvestor-weekly-carloads-july-2019.pdf",
    (2019, 9): "http://www.nscorp.com/content/dam/nscorp/get-to-know-ns/investor-relations/performance-measures/weekly-performance-reports/2019/nvestor-weekly-carloads-september-2019.pdf",
    (2021, 9): "http://www.nscorp.com/content/dam/nscorp/get-to-know-ns/investor-relations/performance-measures/weekly-performance-reports/2021/investor-weekly-carloads-setember-2021.pdf",
}

monthNames = {
    1: 'january', 2: 'february', 3: 'march',
    4: 'april', 5: 'may', 6: 'june',
    7: 'july', 8: 'august', 9: 'september',
    10: 'october', 11: 'november', 12: 'december'
}
nsLinks = [oneOffs.get((d.year, d.month), nsBase.format(d.year, monthNames.get(d.month))) for d in nsDates]

#nsLinks
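
For reference, one constructed Norfolk Southern link looks like this (illustrative only, using the nsBase template and monthNames defined above; the one-off corrections are substituted where they apply):

example = nsBase.format(2021, monthNames[3])
print(example)
# https://www.nscorp.com/content/dam/nscorp/get-to-know-ns/investor-relations/performance-measures/weekly-performance-reports/2021/investor-weekly-carloads-march-2021.pdf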
In [31]:
makeNumeric = lambda v: pd.to_numeric(re.sub("[^.0-9-]", "", str(v)), errors='coerce')

def handle(df_ns, dt, url):
    priorDt = str((pd.to_datetime(dt) - pd.DateOffset(weeks=52)).date())
    #print(len(df_ns.columns))
    try:
        if len(df_ns.columns) < 10:
            df_ns2 = df_ns.iloc[10:, :2].copy()
            df_ns2.columns = ['Key', 'Values'] #+ df_ns2.columns.to_list()[3:]
            df_ns2 = df_ns2.set_index('Key').dropna(how='all') #.iloc[1:]

            #print(df_ns2['Loadings'])

            df_ns2[[dt, priorDt]] = df_ns2['Values'].str.split(" ", expand=True).iloc[:, :2].applymap(makeNumeric)

        else:
            #print(df_ns.iloc[:, :3].head())
            df_ns2 = df_ns.iloc[:, :3].copy()
            df_ns2.columns = ['Key', dt, priorDt]
            df_ns2 = df_ns2.set_index('Key').dropna(how='all') #.iloc[1:]

            #print(df_ns2['Loadings'])

    except Exception as e:
        print(f"Failed {url}")
        print(e)
        return None
    
    #print(df_ns2.iloc[:, -2:].dropna(how='all').head())
    return df_ns2.iloc[:, -2:].dropna(how='all')

def handleOneOff(df, dt):
    tmp = df.iloc[4:, 2:8]
    try:
        tmp.columns = ['Key', 'one', 'two', 'three', 'four', dt]
    except ValueError:
        return None
        
    return tmp.set_index('Key').iloc[:, -1] if (tmp.shape[0] == 25) else None

def getDatesFrom(byteData, df_ns):
    try:
        raw = parser.from_buffer(byteData)
        
        if raw['content']:
            dts = re.findall(".* To: ([\d]{2}-[\d]{2}-[\d]{4}).*", raw['content'])
        else:
            #print(type(df_ns[0]))
            dts = [re.search(".* To: ([\d]{2}-[\d]{2}-[\d]{4}).*", df.columns[1]).group(1) for df in df_ns]
    except AttributeError as ae:
        print("Failed extracting report dates")
        print(ae)
        #print(df_ns.columns[1], df_ns.columns)
        return None
    except ValueError as ve:
        print("Failed extracting report dates")
        print(ve)
        #print(df_ns.columns[1], df_ns.columns)
        return None
    
    return dts

@memory.cache
def fetch(url):
    req = Request(url, data=None, headers={ 'User-Agent': uaString })
    data = urlopen(req)

    byteData = BytesIO(data.read())

    #print(f"Parsing {url}...")
    return byteData, read_pdf(byteData, pages='all')
    
#@memory.cache
def parseNorfolkSouthern(url, retries=3, empty=""):
    """
    NorfolkSouthern Parser
    """
    if retries < 0:
        return None
    
    try:
        #print(f"Parsing {url}...")
        byteData, df_ns = fetch(url)
        
        byteData.seek(0)
        dts = getDatesFrom(byteData, df_ns)
        
        # Handle the "one-offs" - really this is the majority of cases
        if any(map(lambda e: len(e.columns) > 1 and "Norfolk" in e.columns[1], df_ns)):
            return pd.concat([handleOneOff(f, t) for f, t in zip(df_ns, dts)], axis=1)
            
        #for n, df in enumerate(df_ns):
        #    df.to_csv(f"{n}.csv", index=False)
    except HTTPError as he:
        print(f"Failed on {url}")
        print(he)
        #print(df_ns.columns[1], df_ns.columns)
        return None
    except Exception as e:
        print(e)
        sleep(2 ** (3 - retries))
        return parseNorfolkSouthern(url, retries-1)

    #print(dts)
    combined = [handle(f, t, url) for f, t in zip(df_ns, dts)]
    return pd.concat(combined, axis=1) if any(c.shape[0] for c in combined if c is not None) else None
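
A small illustration of the "To:" date extraction used in getDatesFrom, applied to a hypothetical line of report text (the real input comes from the parsed PDF):

import re

line = "Weekly Carloads  From: 01-02-2021  To: 01-08-2021"    # made-up sample
print(re.findall(".* To: ([\d]{2}-[\d]{2}-[\d]{4}).*", line))  # ['01-08-2021']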
In [32]:
%%time

dfs_ns = [parseNorfolkSouthern(l, 2, "") for l in nsLinks]
    
df_ns_ts = pd.concat(dfs_ns, axis=1).T

# df_ns_merged.to_csv("norfolk_southern.csv", sep='|')

# df_ns_merged = pd.read_csv("norfolk_southern.csv", sep="|")

df_ns_ts = df_ns_ts.applymap(makeNumeric)

# http://www.nscorp.com/content/dam/nscorp/get-to-know-ns/investor-relations/performance-measures/weekly-performance-reports/2021/investor-weekly-carloads-december-2021.pdf

df_ns_ts['dt'] = df_ns_ts.index.map(pd.to_datetime)
CPU times: user 25.8 s, sys: 2.37 s, total: 28.2 s
Wall time: 6min 1s
In [33]:
doChart(df=df_ns_ts, metric='Motor Vehicles and Equipment', size=6, color='black', carrier='Norfolk Southern', domain=[-100, 100])
Out[33]:
In [34]:
doChart(df=df_ns_ts, metric='Lumber and Wood Products', carrier='Norfolk Southern', size=5, color='forestgreen')
Out[34]:
In [35]:
doChart(df=df_ns_ts, metric='Pulp, Paper and Allied Products', carrier='Norfolk Southern', size=5, color='forestgreen')
Out[35]:
In [36]:
doChart(df=df_ns_ts, metric='Primary Forest Products', carrier='Norfolk Southern', size=5, color='forestgreen')
Out[36]:
In [37]:
doChart(df=df_ns_ts, metric='Chemicals', carrier='Norfolk Southern', size=5, color='darkseagreen')
Out[37]:
In [38]:
doChart(df=df_ns_ts, metric='Iron and Steel Scrap', carrier='Norfolk Southern', size=5, color='grey')
Out[38]:
In [39]:
doChart(df=df_ns_ts, metric='Total Carloadings', carrier='Norfolk Southern', size=5)
Out[39]:
In [40]:
doChart(df=df_ns_ts, metric='Total Intermodal', carrier='Norfolk Southern', size=5)
Out[40]:
In [41]:
doChart(df=df_ns_ts, metric='Metals and Products', carrier='Norfolk Southern', size=5, color='grey')
Out[41]:
In [42]:
doChart(df=df_ns_ts, metric='Metallic Ores', carrier='Norfolk Southern', size=5, color='black')
Out[42]:
In [43]:
doChart(df=df_ns_ts, metric='Container', carrier='Norfolk Southern', size=5)
Out[43]:
In [44]:
# https://bnsf.com/about-bnsf/financial-information/index.html#Weekly+Carload+Reports

base = "https://bnsf.com/about-bnsf/financial-information/weekly-carload-reports/pdf/{}.pdf"
dates = [str(d).replace("-", "").split()[0] for d in pd.date_range(start='05/16/2009', end=str(date.today()), freq='7D')]

errata = {
    "20200404": "https://web.archive.org/web/20200621144255/https://www.bnsf.com/about-bnsf/financial-information/weekly-carload-reports/pdf/20200404.pdf"
}

links = [errata.get(d, base.format(d)) for d in dates]
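
As an aside, a single weekly BNSF link is formed from a date like so (hypothetical date, same formatting as above, using the base template defined in the previous cell):

import pandas as pd

d = pd.Timestamp("2020-04-11")                        # hypothetical report week
print(base.format(str(d).replace("-", "").split()[0]))
# https://bnsf.com/about-bnsf/financial-information/weekly-carload-reports/pdf/20200411.pdf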
In [45]:
%%time

uaString = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'

cleanValue = lambda v: "-" + re.sub("[^.0-9]", "", str(v)) if "-" in str(v) else re.sub("[^.0-9]", "", str(v))

@memory.cache
def parseBnsf(link, retries=3):
    """
    BNSF Parser
    """
    if retries < 0:
        print("Failed " + link)
        #print(e)
        return None
    
    location = link
    
    try:
        dt = location.split("/")[-1].split(".")[0]
        
        req = Request(location, data=None, headers={ 'User-Agent': uaString })

        data = urlopen(req)

        df = read_pdf(BytesIO(data.read()), pages='all')[0]

        df_t = df.set_index(df.columns[0]).iloc[:, 2]
        df_t = df_t.dropna().apply(cleanValue).apply(pd.to_numeric)
        df_t = df_t.T
        df_t['dt'] = pd.to_datetime(dt)
        df_t['source'] = df_t.index.map(lambda v: location) 

        df_to = df.set_index(df.columns[0]).iloc[:, 5]
        df_to = df_to.dropna().apply(cleanValue).apply(pd.to_numeric)
        df_to = df_to.T
        df_to['dt'] = pd.to_datetime(dt) - pd.DateOffset(weeks=52)
        
        return pd.concat([df_t, df_to], axis=1)
    except Exception as e:
        sleep(2 ** (3 - retries))
        #print(e)
        return parseBnsf(location, retries-1)  
        
dfs = [parseBnsf(l) for l in set(links)]
<timed exec>:43: JobLibCollisionWarning: Cannot detect name collisions for function 'parseBnsf'
Failed https://bnsf.com/about-bnsf/financial-information/weekly-carload-reports/pdf/20220312.pdf
Failed https://bnsf.com/about-bnsf/financial-information/weekly-carload-reports/pdf/20220423.pdf
Failed https://bnsf.com/about-bnsf/financial-information/weekly-carload-reports/pdf/20220319.pdf
Failed https://bnsf.com/about-bnsf/financial-information/weekly-carload-reports/pdf/20220402.pdf
Error from tabula-java:
Error: Error: End-of-File, expected line

Failed https://web.archive.org/web/20200621144255/https://www.bnsf.com/about-bnsf/financial-information/weekly-carload-reports/pdf/20200404.pdf
CPU times: user 24.6 s, sys: 7.89 s, total: 32.5 s
Wall time: 30min 5s
In [46]:
df_bnsf = pd.concat(dfs, axis=1).T.reset_index() #.shape
df_bnsf.columns = ['Total Intermodal'] + df_bnsf.columns[1:].to_list()

use = [c for c in df_bnsf.columns if "source" not in c]

df_bnsf['Total Intermodal'] = df_bnsf['Total Intermodal'].apply(cleanValue).apply(makeNumeric)

# df_bnfs.to_csv("bnsf.csv", sep="|")

#df_bnsf = pd.read_csv('bnsf.csv', sep="|")

df_bnsf['dt'] = df_bnsf['dt'].apply(pd.to_datetime)

df_bnsf_ts = df_bnsf[use] #.groupby('dt').agg('mean').applymap(float).resample('1M').nearest().reset_index()

df_melted = df_bnsf[use].melt(id_vars='dt')
df_melted['value'] = df_melted['value'].apply(float)

df_melted = df_melted.groupby(['dt', 'variable']).agg('mean').reset_index() #.set_index('dt')

df_tab = df_melted.pivot_table(index='dt', columns='variable') #.resample('1M').mean()

df_bnsf_ts = df_tab.droplevel(0, axis=1).reset_index()
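
The reshape above (melt, average duplicate weeks, pivot back to wide) can be seen on a toy frame with made-up values:

import pandas as pd

toy = pd.DataFrame({
    "dt": ["2020-01-04", "2020-01-04", "2020-01-11"],   # duplicate week on purpose
    "COAL": [100.0, 110.0, 95.0],
})
long_form = toy.melt(id_vars="dt")                       # columns: dt, variable, value
wide = long_form.pivot_table(index="dt", columns="variable", values="value", aggfunc="mean")
print(wide)                                              # duplicate (dt, COAL) rows averaged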

BNSF Freight Reports

In [47]:
doChart(df=df_bnsf_ts, metric='Total Intermodal', color='slategray', size=7, resample='3M', carrier='BNSF')
Out[47]:
In [48]:
doChart(df=df_bnsf_ts, metric='LUMBER/WOOD', color='forestgreen', size=7, resample='3M', carrier='BNSF')
Out[48]:
In [49]:
doChart(df=df_bnsf_ts, metric='PULP/PAPER', color='forestgreen', size=7, resample='3M', carrier='BNSF')
Out[49]:
In [50]:
doChart(df=df_bnsf_ts, metric='FOREST PRODUCTS', color='forestgreen', size=7, resample='3M', carrier='BNSF')
Out[50]:
In [51]:
doChart(df=df_bnsf_ts, metric='CHEMICALS', color='darkseagreen', size=7, resample='3M', carrier='BNSF')
Out[51]:
In [52]:
doChart(df=df_bnsf_ts, metric='FOOD', color='green', size=7, resample='3M', carrier='BNSF')
Out[52]:
In [53]:
doChart(df=df_bnsf_ts, metric='MOTOR VEHICLES', color='darkblue', size=7, resample='3M', carrier='BNSF')
Out[53]:
In [54]:
doChart(df=df_bnsf_ts, metric='IRON & STEEL SCRAP', color='slategrey', size=7, resample='3M', carrier='BNSF')
Out[54]:
In [55]:
doChart(df=df_bnsf_ts, metric='METALS', color='black', size=7, resample='3M', carrier='BNSF')
Out[55]:
In [56]:
doChart(df=df_bnsf_ts, metric='COAL', color='black', size=7, resample='3M', carrier='BNSF')
Out[56]:
In [57]:
doChart(df=df_bnsf_ts, metric='GRAIN', color='goldenrod', size=7, resample='3M', carrier='BNSF')
Out[57]:
In [58]:
doChart(df=df_bnsf_ts, metric='SAND/GRAVEL', color='grey', size=7, resample='3M', carrier='BNSF')
Out[58]:
In [59]:
doChart(df=df_bnsf_ts, metric='SAND/GRAVEL', color='goldenrod', size=7, resample='3M', carrier='BNSF')
Out[59]:
In [60]:
doChart(df=df_bnsf_ts, metric='PETROLEUM', color='green', size=7, resample='3M', carrier='BNSF')
Out[60]:
In [61]:
doChart(df=df_bnsf_ts, metric='STONE/CLAY/GLASS', color='gray', size=7, resample='3M', carrier='BNSF')
Out[61]:
In [62]:
doChart(df=df_bnsf_ts, metric='OTHER', color='gray', size=7, resample='3M', carrier='BNSF')
Out[62]:

© kdunn926