import pandas as pd
import altair as alt
import numpy as np
import re
from json import loads, JSONDecodeError
from io import BytesIO
from os import environ
from datetime import date
from IPython.core.display import HTML
from time import sleep
from joblib import Memory
from urllib.request import urlopen, Request
from urllib.error import HTTPError, URLError
from tabula import read_pdf
from time import sleep
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
import selenium.webdriver.support.expected_conditions as EC
from selenium.webdriver.common.by import By
def ajaxComplete(driver):
    """WebDriverWait predicate: True once jQuery reports no pending AJAX calls.

    Swallows WebDriverException (e.g. a page without jQuery loaded yet) and
    returns None so the surrounding wait keeps polling.
    """
    try:
        active = driver.execute_script("return jQuery.active")
    except WebDriverException:
        return None
    return active == 0
# Headless Chrome configuration shared by the Selenium scrapers below.
chrome_options = Options()
chrome_options.add_argument("--disable-extensions")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--headless")
#chrome_options.add_argument("download.default_directory=/tmp")
# The download directory must be set via the "prefs" experimental option;
# the commented argument form above does not work.
chrome_options.add_experimental_option("prefs", { "download.default_directory" : "/tmp" })
# This notebook depends on interactive browsing / manual URL discovery, so
# refuse to run inside a GitHub Actions workflow.
if environ.get("GITHUB_WORKFLOW"):
    raise RuntimeError("Requires manual updates")
def doChart(df, location='Oakland', metric='Full-Total'):
    """Simple monthly-volume line chart for a single port metric.

    Expects `df` to carry a 'dt' column (or index level) plus the `metric`
    column; rows with missing values are dropped before plotting.
    """
    source = df.reset_index()[['dt', metric]].dropna()
    x_axis = alt.X('dt:T', axis=alt.Axis(title=''))
    y_axis = alt.Y(f'{metric}:Q', axis=alt.Axis(title='Volume [TEUs]'))
    hover = [
        alt.Tooltip("dt:T", format="%b %Y"),
        alt.Tooltip(f"{metric}:Q", format=",.02f"),
    ]
    chart = alt.Chart(source).mark_line(color='purple').encode(x_axis, y_axis, tooltip=hover)
    return chart.properties(
        title=f'Port of {location} {metric} Container Volume',
        width=750,
        height=400,
    )
def doYoyChart(df, location='Oakland', metric='Full-Total', width=2, format=None):
    """Two-panel interactive year-over-year chart for one port metric.

    Upper panel: monthly YoY percentage change as bars (negative months in
    salmon, positive in blue); its x-domain follows the brush selection.
    Lower panel: the raw monthly volume line with an interval brush for zoom.

    Parameters
    ----------
    df : DataFrame with a 'dt' column (or index level) and a `metric` column.
    location : port name used in the chart title.
    metric : column of `df` to plot.
    width : YoY bar width in pixels.
    format : optional d3 time-format string for the x-axis labels.
    """
    brush = alt.selection(type='interval', encodings=['x'])
    kwargs = {} if format is None else { "format": format }
    base = alt.Chart(df.reset_index()[['dt', metric]].dropna()).mark_line(color='purple').encode(
        alt.X('dt:T', axis=alt.Axis(title='', **kwargs), impute=alt.ImputeParams(value=None)),
        alt.Y(f'{metric}:Q', axis=alt.Axis(title='Volume [TEUs]')),
        # BUGFIX: the tooltip previously referenced a nonexistent 'nolarge'
        # field (the chart data only has 'dt' and the metric), so hover values
        # never resolved; point it at the plotted metric instead.
        tooltip=[alt.Tooltip("dt:T", format="%b %Y"), alt.Tooltip(f"{metric}:Q", format=",.02f")]
    ).properties(
        width=750,
        height=400
    )
    # YoY series: lag the metric by 12 rows (assumes one row per month) and
    # compute the percent change against that lagged value.
    upper = base.mark_bar(width=width, color='blue').transform_window(
        window=[alt.WindowFieldDef("lag", metric, param=12, **{ 'as': 'previous' })],
        sort=[{"field": "dt"}],
    ).transform_calculate(
        yoy=f"((datum['{metric}'] - datum.previous) / datum.previous) * 100"
    ).encode(
        alt.X('dt:T', axis=alt.Axis(title='', **kwargs), scale=alt.Scale(domain=brush), impute=alt.ImputeParams(value=None)),
        alt.Y(f'yoy:Q', axis=alt.Axis(title='Year-over-Year Volume Growth [%]'), impute=alt.ImputeParams(method='value', keyvals=[400, 1000])),
        color=alt.condition(f"datum['yoy'] < 0",
            alt.value('lightsalmon'),
            alt.value('royalblue')
        )
    ).properties(
        title=f'Port of {location} {metric} Container Volume',
    )
    lower = base.properties(
        height=100
    ).add_selection(brush)
    return (upper & lower).properties(background='white')
# Browser-like User-Agent so the port websites don't reject our requests.
uaString = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
# joblib disk cache under data/ so immutable historical downloads happen once.
memory = Memory('data/', verbose=0)
# "data current as of" banner (rendered when this runs as a notebook cell).
HTML("<h4><i>(data current as of {})</i></h4>".format(date.today().strftime("%B %d, %Y")))
# Port of Los Angeles
# https://www.portoflosangeles.org/business/statistics/container-statistics
# 1995
# https://www.portoflosangeles.org/business/statistics/container-statistics/historical-teu-statistics-2019
@memory.cache
def fetchLaData(u, year, cols, args=None):
    """Download one year of Port of LA TEU statistics and parse the HTML table.

    Results are cached on disk via joblib (historical years never change).

    Parameters
    ----------
    u : per-year statistics page URL.
    year : calendar year, appended to month names to build the 'dt' column.
    cols : column labels to apply after the leading 'Month' column.
    args : extra keyword arguments for pandas.read_html
           (defaults to {'skiprows': 1}).
    """
    # Avoid the mutable-default-argument pitfall; None stands in for the old
    # default of {'skiprows': 1}.
    if args is None:
        args = { 'skiprows': 1 }
    req = Request(u, data=None, headers={ 'User-Agent': uaString })
    data = urlopen(req)
    tmp = pd.read_html(BytesIO(data.read()), **args)[0]
    try:
        tmp.columns = ['Month'] + cols
    except ValueError as e:
        # Column-count mismatch: keep the raw columns and surface the problem.
        #print(tmp.head())
        print(e)
    # Unparseable month labels (e.g. a totals row) become NaT and are dropped later.
    tmp['dt'] = tmp['Month'].apply(lambda v: pd.to_datetime("{}-{}".format(v, year), format="%B-%Y", errors='coerce'))
    return tmp
def fetchLaDataNoCache(u, year, cols, args=None):
    """Uncached variant of fetchLaData, used for the in-progress year whose
    table changes monthly.

    Same parameters and behavior as fetchLaData, minus the joblib cache.
    """
    # Avoid the mutable-default-argument pitfall; None stands in for the old
    # default of {'skiprows': 1}.
    if args is None:
        args = { 'skiprows': 1 }
    req = Request(u, data=None, headers={ 'User-Agent': uaString })
    data = urlopen(req)
    tmp = pd.read_html(BytesIO(data.read()), **args)[0]
    try:
        tmp.columns = ['Month'] + cols
    except ValueError as e:
        # Column-count mismatch: keep the raw columns and surface the problem.
        #print(tmp.head())
        print(e)
    # Unparseable month labels (e.g. a totals row) become NaT and are dropped later.
    tmp['dt'] = tmp['Month'].apply(lambda v: pd.to_datetime("{}-{}".format(v, year), format="%B-%Y", errors='coerce'))
    return tmp
# The 1995 table lacks the 'Total Exports' column that later years carry.
colsPre96 = ['Loaded Imports', 'Empty Imports', 'Total Imports',
             'Loaded Export', 'Empty Exports', 'Total TEUs', 'Prior Year Change']
# Column layout used from 1996 onward.
cols = ['Loaded Imports', 'Empty Imports', 'Total Imports',
        'Loaded Export', 'Empty Exports', 'Total Exports', 'Total TEUs',
        'Prior Year Change']
args = {
    'skiprows': 1
}
# Per-year statistics page on the Port of Los Angeles site.
urlFor = lambda y: "https://www.portoflosangeles.org/business/statistics/container-statistics/historical-teu-statistics-{}".format(y)
currentYear = date.today().year
dfs_la = []
for y in range(1995, currentYear + 1):
    url = urlFor(y)
    #print(url)
    if y < 1996:
        dfs_la.append(fetchLaData(url, y, colsPre96, args))
    elif y == currentYear:
        # The in-progress year changes every month, so bypass the disk cache.
        dfs_la.append(fetchLaDataNoCache(url, y, cols, {}))
    else:
        dfs_la.append(fetchLaData(url, y, cols, {}))
df_la = pd.concat(dfs_la).dropna() #.tail()
# NOTE(review): this only coerces the first column; the loop below re-coerces
# every numeric column anyway, so this looks redundant — verify before removing.
df_la[cols[:1]] = df_la[cols[:1]].apply(pd.to_numeric)
def extractValue(v):
    """Parse a numeric string that may use '.' as a thousands separator.

    Strings containing more than one '.' (e.g. '1.234.56') treat the first
    '.' as a thousands separator: it is stripped, commas are removed, and the
    remainder is parsed as a float. Otherwise the string is parsed directly.

    Raises ValueError for strings float() cannot parse (same as before).
    """
    if v.count(".") > 1:
        # Raw string for the regex (the old "\." relied on a deprecated
        # non-raw escape); count=1 removes only the first separator dot.
        return float(re.sub(r"\.", "", v, count=1).replace(",", ""))
    return float(v)
# Coerce every data column to float; fall back to the thousands-separator-aware
# parser for columns float() chokes on.
for c in df_la.columns:
    if c in ['Month', 'dt', 'Prior Year Change']:
        continue
    try:
        df_la[c] = df_la[c].apply(float)
    except ValueError:
        df_la[c] = df_la[c].apply(extractValue)
#df_la.head()
#df_la_smoothed = df_la.set_index('dt').resample('2M').mean().reset_index()
# Save the headline chart to disk for embedding elsewhere; the rest display inline.
c = doYoyChart(df_la, location='Los Angeles', metric='Loaded Imports')
c.save('transportation-ocean.png')
c.display()
doYoyChart(df_la, location='Los Angeles', metric='Loaded Export')
doYoyChart(df_la, location='Los Angeles', metric='Total Imports')
doYoyChart(df_la, location='Los Angeles', metric='Total TEUs')
# Port of Long Beach
# http://www.polb.com/economics/stats/teus_archive.asp
# NOTE(review): rebinds the module-wide uaString to a newer browser string —
# presumably the POLB site rejects the older one; confirm before consolidating.
uaString = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'
req = Request("https://thehelm.polb.com/stellar_custom_table/table100/", data=None, headers={ 'User-Agent': uaString })
data = urlopen(req)
# displayed_only=False also parses rows hidden via CSS; iloc[:-1] drops the
# trailing row of the table.
df_lb_raw = pd.read_html(BytesIO(data.read()), displayed_only=False)[0].iloc[:-1]
# Drop the leading column.
df_lb_raw = df_lb_raw.iloc[:, 1:]
df_lb_raw[df_lb_raw.columns[1:]] = df_lb_raw[df_lb_raw.columns[1:]].apply(pd.to_numeric)
df_lb_raw['dt'] = df_lb_raw['Date'].map(lambda v: pd.to_datetime(v, format='%b %Y'))
#df_lb_raw.head(16)
doYoyChart(df_lb_raw, location='Long Beach', metric='Total Loaded')
doYoyChart(df_lb_raw, location='Long Beach', metric='Total')
doYoyChart(df_lb_raw, location='Long Beach', metric='Total Empties')
doYoyChart(df_lb_raw, location='Long Beach', metric='Loaded Inbound')
# Port of NY/NJ embeds its stats tables inside a sitewide content-model JSON.
dataUrl = "https://www.panynj.gov/content/port/en.model.json"
#jqPath = '\'.":children"."/port/en/our-port/facts-and-figures".":items"."root".":items"."responsivegrid".":items"."accordionlist_435572874".":items"\''
def doNynjFetch(retries = 5):
    """Fetch the PANYNJ content JSON and drill down to the accordion items
    that hold the per-year TEU tables.

    Retries on malformed JSON responses (sleeping 1s between attempts);
    once `retries` is exhausted an empty dict is returned. Other errors
    (network failures, missing keys) propagate to the caller.
    """
    keyPath = [
        ":children",
        "/port/en/our-port/facts-and-figures",
        ":items",
        "root",
        ":items",
        "responsivegrid",
        ":items",
        "accordionlist_435572874",
        ":items",
    ]
    while retries >= 0:
        try:
            req = Request(dataUrl, data=None, headers={ 'User-Agent': uaString })
            node = loads(urlopen(req).read())
            for key in keyPath:
                node = node[key]
            return node
        except JSONDecodeError:
            sleep(1)
            retries -= 1
    return loads("{}")
nynj_dict_content = doNynjFetch()
# Column layout used by the post-2011 tables (the try branch below).
post2011Cols = ['Import Loads', 'Import Empties', 'Export Loads', 'Export Empties', 'Total Loads', 'Total Empties', 'Total TEUs', 'Total Rail Lifts']
# Column layout for the older, simpler tables (the except branch below).
cols = ["Import TEUs", "Export TEUs", "Total TEUs", "Total Rail Lifts"]
currentYear = date.today().year
dfs_nynj = []
for v in nynj_dict_content.values():
    # Each accordion item is one year; the year lives in its aria label.
    year = v["linkAriaLabel"]
    #print(year)
    try:
        tmp = pd.read_html(v[":items"]["text"]["text"])
    except KeyError:
        # Some years store the HTML under 'text_copy' instead of 'text'.
        tmp = pd.read_html(v[":items"]["text_copy"]["text"])
    for n, monthData in enumerate(tmp):
        # skip the YTD section
        #if n < 2:
        #    continue
        mtmp = monthData.copy()
        try:
            mtmp.columns = ['Period'] + post2011Cols
            # cleanup types before doing math
            mtmp[post2011Cols] = mtmp[post2011Cols].applymap(lambda v: re.sub("[^0-9.]", "", str(v)))
            mtmp[post2011Cols] = mtmp[post2011Cols].apply(lambda v: pd.to_numeric(v, errors='coerce'))
            mtmp['Import TEUs'] = mtmp['Import Loads'] + mtmp['Import Empties']
            mtmp['Export TEUs'] = mtmp['Export Loads'] + mtmp['Export Empties']
            # Months are listed Dec->Jan after 2014, Jan->Dec before 2014 (and
            # for the current year), so the table position maps to a month number.
            index = 13-n if int(year) > 2014 and int(year) != currentYear else n
            mtmp['dt'] = mtmp.iloc[:, 0].map(lambda v: pd.to_datetime("{}-{}".format(index, v), format="%m-%Y", errors='coerce'))
        except ValueError:
            # Narrower, older table layout: the period cell holds "Month YY".
            mtmp.columns = ['Period'] + cols
            mtmp['dt'] = mtmp.iloc[:, 0].map(lambda v: pd.to_datetime(v, format="%B %y", errors='coerce'))
            mtmp[cols] = mtmp[cols].applymap(lambda v: re.sub("[^0-9.]", "", str(v)))
            mtmp[cols] = mtmp[cols].apply(lambda v: pd.to_numeric(v, errors='coerce'))
        dfs_nynj.append(mtmp)
# Keep only the fields both layouts share; average duplicate month rows.
commonFields = ['Period', 'dt', 'Import TEUs', 'Export TEUs', 'Total TEUs']
df_nynj = pd.concat(dfs_nynj)[commonFields].dropna() #.tail(30)
df_nynj = df_nynj.groupby('dt').agg('mean') #.head()
doYoyChart(df_nynj.reset_index(), location="New York/New Jersey", metric="Import TEUs")
doYoyChart(df_nynj.reset_index(), location="New York/New Jersey", metric="Export TEUs")
doYoyChart(df_nynj[12:].reset_index(), location="New York/New Jersey", metric="Total TEUs")
@memory.cache
def doGaFetch(url, retries=5):
    """Fetch the Georgia Ports monthly-TEU PDF and extract its tables.

    Returns the list of DataFrames produced by tabula, or None once all
    retries are exhausted. Results are cached on disk via joblib.
    """
    if retries < 0:
        return None
    try:
        req = Request(url, data=None, headers={ 'User-Agent': uaString })
        data = urlopen(req)
        tabs = read_pdf(BytesIO(data.read()), pages='all')
        # Probe the first table; an empty result means the PDF parse failed.
        tabs[0]
        return tabs
    except (KeyError, IndexError):
        # KeyError can surface from tabula's parsing; IndexError comes from
        # the probe above when no tables were extracted (previously uncaught,
        # which skipped the retry entirely). Back off and retry.
        sleep(1)
        return doGaFetch(url, retries-1)
# Georgia Ports Authority: scrape the current monthly-TEU PDF link off the
# stats page (IPython shell magic).
reportPage = 'https://gaports.com/sales/by-the-numbers/'
currentReport = !curl -s -L $reportPage | egrep -i 'Monthly.*TEU.*pdf' | perl -pe 's/.*href=\"([^"]+)\".*/\1/'
print(currentReport[0])
#current = "https://gaports.com/wp-content/uploads/2021/04/Monthly-TEU-Throughput-March-2021.pdf"
tabs = doGaFetch(currentReport[0])
# One row per year: twelve month columns plus an annual total.
cols = ['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 'Total']
def flatten(l):
    """Concatenate one level of nesting: [[a, b], [c]] -> [a, b, c]."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat
def parseGpTable(df, columns, start=0, end=17):
    """Parse a tabula-extracted Georgia Ports year-by-month table into a
    monthly time series.

    tabula merges several month cells into single space-separated strings;
    rows are split back apart, melted to long form, and resampled to one
    value per month-end.

    Parameters
    ----------
    df : raw DataFrame from tabula's read_pdf.
    columns : labels for the reshaped table ('Year', months, 'Total').
    start / end : row slice of `df` holding the summary table; end=None uses
                  rows start:17 with all columns.

    Returns a DataFrame with 'dt' and 'TEU Total' columns.
    """
    gp_raw = df.iloc[start:end, 1:].copy() if end is not None else df.iloc[start:17, :].copy()
    #print(gp_raw.iloc[0, :].values)
    firstRowVals = flatten(map(lambda v: str(v).split(), gp_raw.iloc[0, :-1].dropna().values))
    #print(firstRowVals)
    # Use all the month values in order then append the total value as the
    # last element.  BUGFIX: pad against the `columns` parameter, not the
    # module-level `cols` global the original referenced.
    firstRow = [firstRowVals[n] if n < len(firstRowVals) else '' for n in range(len(columns[:-1]))] + [gp_raw.iloc[0, -1]]
    #print(firstRow)
    records = gp_raw.iloc[:, 1:].apply(lambda s: flatten(map(lambda v: str(v).split(), s.values)), axis=1)
    #print(records.head(1))
    df_ga_melted = pd.DataFrame.from_records([firstRow] + [r for r in records.values], columns=columns).melt(id_vars='Year')
    # Drop the annual-total column before building the monthly index.
    df_ga_melted = df_ga_melted[df_ga_melted['variable'] != 'Total']
    df_ga_melted['dt'] = pd.to_datetime(df_ga_melted['Year'] + '-' + df_ga_melted['variable'], format='%Y-%b')
    df_ga_melted['value'] = df_ga_melted['value'].apply(lambda v: pd.to_numeric(str(v).replace(",", ""), errors='coerce'))
    df_ga_tst = df_ga_melted[['value', 'dt']].set_index('dt').resample('1M').mean().rename({'value': 'TEU Total'}, axis=1)
    return df_ga_tst.reset_index()
df_ga_tst = parseGpTable(tabs[0], columns=cols, start=3, end=None)
# Detail rows carry a 'Key' column (Import/Export x Full/Empty/Total).
cols2 = ['Year', 'Key', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec', 'Total']
#gp_raw = tabs[0].iloc[3:21].copy()
# Detail section starts after row 18 of the first table and continues across
# the remaining PDF pages.
gp_detail_raw = pd.concat([tabs[0].iloc[18:].copy()] + tabs[1:]).copy()
gp_detail_raw = gp_detail_raw[~gp_detail_raw.iloc[:, 1].isna()]
#print(gp_detail_raw)
# Exclude rows with erroneous data
gp_detail_raw = gp_detail_raw[~gp_detail_raw.iloc[:, 2].str.contains("%")]
# Keep only rows that look like detail keys (Import/Export/Total) or years.
gp_detail_raw = gp_detail_raw[gp_detail_raw.iloc[:, 1].str.match("^(I|E|T|\d)")]
#print(gp_detail_raw.head(25))
# Re-join key names the PDF splits on spaces, e.g. 'Import Full' -> 'Import-Full'.
hypenate = lambda v: str(v).replace("rt ", "rt-")\
    .replace("Total Full", "Total-Full")\
    .replace("Total Empty", "Total-Empty")\
    .replace("Total All", "Total-All")
gp_detail_raw.iloc[:, 1] = gp_detail_raw.iloc[:, 1].apply(hypenate)
# Disabled alternative parsing path, deliberately kept as a string for quick
# re-enabling (see the toggle note inside).
"""
# Special case for the current year
firstRowVals2 = flatten(map(lambda v: hypenate(v).split(), gp_detail_raw.iloc[0, :-1].values))
firstRow2 = [firstRowVals2[n] if n < len(firstRowVals2) else '' for n in range(len(cols2[:-1]))] + [gp_detail_raw.iloc[0, -1]]
# General case for the historical years
remainingRows = gp_detail_raw.iloc[1:].apply(lambda s: flatten(map(lambda v: hypenate(v).split(), s.to_list())), axis=1)
gp_detail = pd.DataFrame.from_records([firstRow2] + [r for r in remainingRows.values], columns=cols2)
#print(gp_detail.head())
# Find all the records that don't have the year in them
rowsToShift = gp_detail[~gp_detail['Year'].str.match("^\d")].index.values
#####
# Note, this is the likely line to toggle between commented<->uncommented if seeing issues
#####
# Shift records not containing the year to the right by one to line things up
gp_detail.iloc[rowsToShift] = gp_detail.iloc[rowsToShift].apply(lambda v: pd.Series([None] + v.to_list()[:-1]), axis=1)
gp_detail.iloc[rowsToShift, 0] = pd.to_numeric('', errors='coerce')
""";
gp_detail = gp_detail_raw.copy()
gp_detail.columns = cols2
# Fill the years for the rest of the rows
gp_detail['Year'] = gp_detail['Year'].ffill(limit=8)
#print(gp_detail.head(20))
df_ga_detail_melted = gp_detail.melt(id_vars=["Key", "Year"])
df_ga_detail_melted = df_ga_detail_melted[df_ga_detail_melted["variable"] != "Total"]
# Years look like '2020.0' after the ffill (presumably floats) — strip the
# fractional part before parsing the date.
df_ga_detail_melted['dt'] = pd.to_datetime(df_ga_detail_melted['Year'].apply(str).apply(lambda v: v.split(".")[0]) +\
    '-' + df_ga_detail_melted['variable'], format="%Y-%b", errors='coerce')
df_ga_detail_melted['value'] = df_ga_detail_melted['value']\
    .apply(lambda v: str(v).replace(",", ""))\
    .apply(lambda v: pd.to_numeric(v, errors='coerce'))
#print(df_ga_detail_melted.fillna('nan').dropna().tail())
# Wide table: one column per detail key, indexed by month.
df_ga_ts = df_ga_detail_melted[['dt', 'Key', 'value']].dropna().pivot(index='dt', columns='Key').reset_index()
df_ga_ts.columns = ['dt', 'Export Empty', 'Export Full', 'Export Total',\
    'Import Empty', 'Import Full', 'Import Total', 'Total All', 'Total Empty', 'Total Full']
df_ga_ts = df_ga_ts.dropna()
doYoyChart(df_ga_ts, location='Georgia', metric='Total All')
doYoyChart(df_ga_ts, location='Georgia', metric='Import Empty')
doYoyChart(df_ga_ts, location='Georgia', metric='Import Full')
doYoyChart(df_ga_ts, location='Georgia', metric='Export Total')
doYoyChart(df_ga_ts, location='Georgia', metric='Export Empty')
doYoyChart(df_ga_ts, location='Georgia', metric='Export Full')
# NW Seaport Alliance (Seattle/Tacoma): find the stats spreadsheet link.
result = !curl -s 'https://www.nwseaportalliance.com/about-us/cargo-statistics' | grep 'xlsx'
#pageCount = re.findall("of ([0-9]+)", "".join(rawPg))[0]
currentReport = [u for u in re.findall('href="([^"]+)"', "".join(result)) if "xls" in u][0]
#print(currentReport)
df_nwsea = pd.read_excel(currentReport, skiprows=2)
df_nwsea.columns = ["Month", "Measure"] + df_nwsea.columns.tolist()[2:]
# Month labels span several measure rows; forward-fill, then drop the
# percentage/total summary rows.
df_nwsea['Month'] = df_nwsea['Month'].ffill()
df_nwsea = df_nwsea[~pd.isna(df_nwsea['Measure'])]
df_nwsea = df_nwsea[~df_nwsea['Month'].str.match("%|Total")]
df_nwsea_melt = df_nwsea.melt(id_vars=['Month', 'Measure'])
# Year lives in the spreadsheet column header ('variable' after melting);
# stamp each observation at month-end.
df_nwsea_melt['dt'] = pd.to_datetime(df_nwsea_melt['Month'] + "-" + df_nwsea_melt['variable'].apply(str), format='%b-%Y')\
    + pd.offsets.MonthEnd(0)
#df_nwsea_melt.tail(25)
df_nwsea_tab = df_nwsea_melt[['dt', 'Measure', 'value']].pivot_table(index='dt', columns='Measure', values='value')
# Zero totals mark months with no data yet.
df_nwsea_tab = df_nwsea_tab[df_nwsea_tab['Total TEUs'] != 0]
#df_nwsea_tab.head()
df_nwsea_tab
doYoyChart(df_nwsea_tab, metric="International Exports Full", location="Seattle/Tacoma", width=7, format='%b-%y')
doYoyChart(df_nwsea_tab, metric="International Exports Empty", location="Seattle/Tacoma", width=7, format='%b-%y')
@memory.cache
def fetchHouston(url, retries=5):
    """Download the Port Houston TEU-statistics PDF and parse it into one
    DataFrame with a parsed 'dt' column.

    Retries up to `retries` times (1s apart) when the PDF tables fail to
    parse; returns an empty DataFrame once retries are exhausted. Results
    are cached on disk via joblib.
    """
    if retries < 0:
        return pd.DataFrame()
    try:
        req = Request(url, data=None, headers={ 'User-Agent': uaString })
        data = urlopen(req)
        cols = ["Date", "Loaded Imports", "Loaded Exports", "Loaded Total",
                "Empty Imports", "Empty Exports", "Empty Total", "Loaded and Empty Total"]
        dfs_hs = read_pdf(BytesIO(data.read()), pages='all', pandas_options={ 'names': cols }) #.head(20)
        # The first table carries 5 header rows; later tables are data-only.
        df_hs = pd.concat([dfs_hs[0].iloc[5:]] + dfs_hs[1:])
        # Strip thousands separators and coerce all data columns to numeric.
        df_hs[df_hs.columns.to_list()[1:]] = df_hs[df_hs.columns.to_list()[1:]]\
            .applymap(lambda v: pd.to_numeric(v.replace(",", "") if isinstance(v, str) else v, errors='coerce'))
        df_hs['dt'] = df_hs['Date'].map(lambda v: pd.to_datetime(str(v), format='%b-%y', errors='coerce'))
        return df_hs
    except (KeyError, IndexError):
        # KeyError can surface from tabula's parsing; IndexError from
        # dfs_hs[0] when no tables were extracted (previously uncaught,
        # which skipped the retry entirely). Back off and retry.
        sleep(1)
        return fetchHouston(url, retries-1)
# Port Houston: scrape the current TEU-stats PDF link off the stats page.
res = !curl -s 'https://porthouston.com/about-us/statistics/' | grep 'TEU-stats'
currentUrl = re.findall('href="([^"]+)"', "".join(res))[0]
#print(currentUrl)
# TODO - automate detection of this url from here: https://porthouston.com/about-us/statistics/
df_hs = fetchHouston(currentUrl)
#df_hs.tail()
doYoyChart(df_hs, metric="Empty Imports", location="Houston")
doYoyChart(df_hs, metric="Empty Exports", location="Houston")
doYoyChart(df_hs, metric="Loaded Total", location="Houston")
def fetchCharleston(url, retries=5):
    """Download the SC Ports TEU-history PDF and extract its tables.

    Returns the list of DataFrames tabula found, or None once all retries
    (1s apart) are exhausted.
    """
    if retries < 0:
        return None
    try:
        req = Request(url, data=None, headers={ 'User-Agent': uaString })
        data = urlopen(req, timeout=15)
        dfs_ch = read_pdf(BytesIO(data.read()), pages='all')
        # Probe the first table; an empty list means the PDF parse failed.
        dfs_ch[0]
        return dfs_ch
    except (KeyError, IndexError):
        # KeyError can surface from tabula's parsing; IndexError from the
        # probe above when no tables were extracted (previously uncaught,
        # which skipped the retry entirely).
        print("Response code: {}".format(data.getcode()))
        sleep(1)
        return fetchCharleston(url, retries-1)
    except URLError:
        # Covers HTTPError and timeouts; no response object here, so retry
        # without printing a status code.
        sleep(1)
        return fetchCharleston(url, retries-1)
# Port of Charleston publishes its entire TEU history as one PDF.
dfs_ch = fetchCharleston("http://scspa.com/wp-content/uploads/gl078-pc-teu-history.pdf")
def fixColumns(df_):
    """Normalize one tabula-extracted Charleston TEU table.

    tabula merges adjacent numeric columns into single space-separated
    strings; this splits them back apart, reorders everything into a fixed
    14-column layout, coerces the data columns to numeric, and adds a parsed
    'dt' timestamp column.

    Returns the cleaned DataFrame, or None when a page's layout doesn't
    produce the expected split pieces.
    """
    # Rows 0-1 are header rows in the extracted tables.
    firstDataRow = 2
    df = df_.copy()
    # Standardize column count
    if df.shape[1] == 10:
        # Standardize rail moves column location
        if df.iloc[:, 1].isna().all():
            df.iloc[:, 1] = df.iloc[:, 2]
        df = df.drop(df.columns[2], axis=1)
    res = df.iloc[firstDataRow:].copy()
    #print(df.head())
    #print(df.iloc[firstDataRow:, 2].values)
    # split rail column
    _ = df.iloc[firstDataRow:, 2].astype(str).str.split(" ", n = 1, expand = True)
    #print(_)
    res["RAIL-MOVES"]= _[0]
    try:
        res["RAIL-RATIO"]= _[1]
    except KeyError:
        # Split produced only one piece: layout mismatch, give up on this page.
        return None
    #res = pd.concat([df.iloc[firstDataRow:, 0:4], res.iloc[:, 2:]], axis=1)
    # split pier loaded column
    # NOTE: _[0] + _[1] re-joins the halves of a thousands-grouped number
    # that the split broke apart (e.g. '1,' + '234').
    _ = df.iloc[firstDataRow:, 3].str.split(" ", n = 2, expand = True)
    res["PIER-LOADED-EXPORT"]= _[0] + _[1]
    try:
        res["PIER-LOADED-IMPORT"]= _[2]
    except KeyError:
        return None
    #res = pd.concat([res, res.iloc[:, -2:]], axis=1)
    try:
        # split pier empty column
        _ = df.iloc[firstDataRow:, 4].str.split(" ", n = 2, expand = True)
        res["PIER-EMPTY-EXPORT"]= _[0] + _[1]
        res["PIER-EMPTY-IMPORT"]= _[2]
    except KeyError:
        # Two-piece variant: values were small enough not to be re-split.
        res["PIER-EMPTY-EXPORT"]= _[0]
        res["PIER-EMPTY-IMPORT"]= _[1]
    #res["PIER-TOTAL"]= _[2]
    #res = pd.concat([df.iloc[firstDataRow:, 0], res.iloc[:, 2:]], axis=1)
    # split loaded teu column
    _ = df.iloc[firstDataRow:, 6].str.split(" ", n = 2, expand = True)
    res["TEU-LOADED-EXPORT"]= _[0] + _[1]
    res["TEU-LOADED-IMPORT"]= _[2]
    #res = pd.concat([df.iloc[firstDataRow:, 0], df.iloc[firstDataRow:, 4:6], res.iloc[:, 8:10], df.iloc[firstDataRow:, -1]], axis=1)
    #print(res)
    try:
        # split empty teu column
        _ = df.iloc[firstDataRow:, 7].str.split(" ", n = 2, expand = True)
        res["TEU-EMPTY-EXPORT"]= _[0] + _[1]
        res["TEU-EMPTY-IMPORT"]= _[2]
    except KeyError:
        # Two-piece variant, as above.
        res["TEU-EMPTY-EXPORT"]= _[0]
        res["TEU-EMPTY-IMPORT"]= _[1]
    #res = pd.concat([df.iloc[firstDataRow:, 0], df.iloc[firstDataRow:, 4:6], res.iloc[:, 8:], df.iloc[firstDataRow:, -1]], axis=1)
    # Assemble the final column order: originals by position, splits by name.
    res = pd.concat([res.iloc[:, 0],
        res.iloc[:, 1],
        res["RAIL-MOVES"],
        res["RAIL-RATIO"],
        res["PIER-LOADED-EXPORT"],
        res["PIER-LOADED-IMPORT"],
        res["PIER-EMPTY-EXPORT"],
        res["PIER-EMPTY-IMPORT"],
        res.iloc[:, 5],
        res["TEU-LOADED-EXPORT"],
        res["TEU-LOADED-IMPORT"],
        res["TEU-EMPTY-EXPORT"],
        res["TEU-EMPTY-IMPORT"],
        res.iloc[:, 8],
        ], axis=1)
    res.columns = ['DATE',
        'RAIL-DRAYS', 'RAIL-MOVES', 'RAIL-RATIO',
        'PIER-LOADED-EXPORT', 'PIER-LOADED-IMPORT',
        'PIER-EMPTY-EXPORT', 'PIER-EMPTY-IMPORT',
        'PIER-TOTAL',
        'TEU-LOADED-EXPORT', 'TEU-LOADED-IMPORT',
        'TEU-EMPTY-EXPORT', 'TEU-EMPTY-IMPORT',
        'TEU-TOTAL']
    #print(res)
    # fix remaining column labels
    #res.columns = ['DATE', 'RAIL-DRAYS', 'RAIL-MOVES'] + res.columns.to_list()[3:-1] + ["TEU-TOTAL"]
    # convert all columns to numeric
    res.iloc[:, 1:3] = res.iloc[:, 1:3]\
        .applymap(lambda v: pd.to_numeric(v.replace(",", "").replace(" ", "") if isinstance(v, str) else v, errors='coerce'))
    res.iloc[:, 4:] = res.iloc[:, 4:]\
        .applymap(lambda v: pd.to_numeric(v.replace(",", "").replace(" ", "") if isinstance(v, str) else v, errors='coerce'))
    res['dt'] = res['DATE'].apply(lambda v: pd.to_datetime(v, format="%B, %Y", errors='coerce'))
    return res
# Charleston charting is currently disabled.
#df_ch = pd.concat([fixColumns(d) for d in dfs_ch[:2]])
#df_ch.head()
#doYoyChart(df_ch, metric='TEU-LOADED-EXPORT', location='Charleston')
#doYoyChart(df_ch, metric='TEU-LOADED-IMPORT', location='Charleston')
#doYoyChart(df_ch, metric='TEU-TOTAL', location='Charleston')
# Port of Virginia
# http://www.portofvirginia.com/who-we-are/port-stats/
#raise RuntimeError("Broken")
# Former curl-based discovery of the spreadsheet link (the page now renders
# via JavaScript, hence the Selenium path below).
#result = !curl -sL 'http://www.portofvirginia.com/who-we-are/port-stats/' | grep 'xlsx'
#currentReport = [u for u in re.findall('href="([^"]+)"', "".join(result)) if "xls" in u][0]
baseUrl = "http://www.portofvirginia.com/who-we-are/port-stats/"
def fetchCurrentReport():
    """Scrape the Port of Virginia stats page (JS-rendered) for the current
    statistics-spreadsheet URL using headless Chrome.

    Raises selenium.common.exceptions.TimeoutException when the page content
    doesn't appear within 15s.
    """
    driver = webdriver.Chrome(executable_path='/usr/local/bin/chromedriver',
                              options=chrome_options)
    try:
        driver.get(baseUrl)
        wait = WebDriverWait(driver, 15)
        #wait.until(ajaxComplete)
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#container")))
        reportLink = "#container > div > section > article > div > ul > li:nth-child(2) > a"
        # find_element_by_css_selector was removed in Selenium 4; use the
        # By-based API (By is already imported at the top of the file).
        return driver.find_element(By.CSS_SELECTOR, reportLink).get_attribute('href')
    finally:
        # Always shut the browser down — the original leaked a headless
        # Chrome process on every call (and on timeouts).
        driver.quit()
currentReport = fetchCurrentReport()
#print(currentReport)
oldUrl = "http://www.portofvirginia.com/excel/Port_of_Virginia_Statistics.xlsx"
url = currentReport #"https://web.archive.org/web/20191218025738if_/http://www.portofvirginia.com/excel/Port_of_Virginia_Statistics.xlsx"
df_va = pd.read_excel(url, skiprows=1)
df_va.columns = ['Month', 'Key'] + df_va.columns.to_list()[2:]
# Keep only the monthly section of the sheet (rows past 71 hold other data).
df_va = df_va.iloc[:71]
df_va = df_va.dropna(how='all')
# Each month label spans several Key rows; limit=4 avoids filling past a group.
df_va['Month'] = df_va['Month'].ffill(limit=4)
#df_va.head(20)
df_va_melted = df_va.melt(id_vars=['Month', 'Key'])
df_va_melted = df_va_melted[~df_va_melted['value'].isna()]
# Year comes from the spreadsheet column header ('variable' after melting).
df_va_melted['dt'] = pd.to_datetime(df_va_melted['Month'] + ' ' + df_va_melted['variable'].apply(str), format="%b %Y")
df_va_clean = df_va_melted.pivot(index='dt', columns='Key', values='value')
# NOTE(review): 'Virgina' (sic) appears in the rendered chart titles.
doYoyChart(df_va_clean, location='Virgina', metric='Export Empties')
doYoyChart(df_va_clean, location='Virgina', metric='Import Loads')
doYoyChart(df_va_clean, location='Virgina', metric='Export Loads')
doYoyChart(df_va_clean, location='Virgina', metric='Total TEUs')
# Port of Oakland
# https://www.oaklandseaport.com/performance/facts-figures/
# Historical monthly TEUs come from a published spreadsheet...
df_oakland = pd.read_excel("https://www.oaklandseaport.com/files/PDF/Oakland%20Monthly%20TEUs%201998-2021.xlsx",
    skiprows=2)
# Normalize headers like 'Import Full' to 'Import-Full' (non-letters -> '-').
df_oakland.columns = [re.sub("[^A-z]+", "-", c) for c in df_oakland]
df_oakland['dt'] = pd.to_datetime(df_oakland['Year'].apply(str) + df_oakland['Month'], format="%Y%B", errors='coerce')
df_oakland = df_oakland.dropna()
df_oakland = df_oakland.drop(['Year', 'Month'], axis=1)
df_oakland = df_oakland.set_index('dt')
#df_oakland.tail()
# ...while the current year's numbers are scraped from the facts-and-figures
# page's chart markup (IPython shell magic).
command = "curl -s 'https://www.oaklandseaport.com/performance/facts-figures/' | egrep 'class=\"(number|chart-subtitle)\"'"
df_oak_latest_res = !$command
df_oak_latest_raw = "".join(df_oak_latest_res)
def extractMonthAndValue(s):
    """Scrape (month, value) string pairs out of the Oakland stats-chart HTML."""
    return re.findall('title="([A-z]+)"><span class="number">([0-9,.]+)', s)
def getLabel(n):
    """Map a chart position index to its TEU series label.

    Indices 0-2 have fixed labels; anything else maps to 'Export-Empty'.
    """
    labels = {
        0: 'Import-Full',
        1: 'Export-Full',
        2: 'Import-Empty',
    }
    return labels.get(n, 'Export-Empty')
# Split the scraped HTML into per-chart sections; n-1 because the text before
# the first 'chart-subtitle' marker is not a chart.
df_oak_latest_stg = [(getLabel(n-1), extractMonthAndValue(v)) for n, v in enumerate(re.split('class="chart-subtitle"', df_oak_latest_raw)) if "number" in v]
currentYear = date.today().year
df_oak_latest = pd.DataFrame()
for kind, vals in df_oak_latest_stg:
    if len(vals) == 0:
        continue
    #print(kind, vals)
    tmp = pd.DataFrame.from_records(vals)
    # The page only shows the current year, so stamp months with it.
    tmp['dt'] = pd.to_datetime(f"{currentYear}-" + tmp.iloc[:, 0], format="%Y-%b")
    tmp.iloc[:, 1] = pd.to_numeric(tmp.iloc[:, 1].map(lambda v: v.replace(",", "")))
    tmp['type'] = tmp.index.map(lambda v: kind)
    tmp.columns = ['month', 'value', 'dt', 'type']
    df_oak_latest = pd.concat([df_oak_latest, tmp])
# Wide table matching the historical spreadsheet's column scheme, plus totals.
df_oak_latest_tab = df_oak_latest.pivot_table(index='dt', columns='type', values='value')
df_oak_latest_tab['Total-Full'] = df_oak_latest_tab['Import-Full'] + df_oak_latest_tab['Export-Full']
df_oak_latest_tab['Total-Empty'] = df_oak_latest_tab['Import-Empty'] + df_oak_latest_tab['Export-Empty']
df_oak_latest_tab['Grand-Total'] = df_oak_latest_tab['Total-Full'] + df_oak_latest_tab['Total-Empty']
#df_oak_latest_tab.head()
# Stitch the historical spreadsheet and the scraped current year together.
df_oak_combined = pd.concat([df_oakland, df_oak_latest_tab]).reset_index()
doYoyChart(df_oak_combined, metric='Total-Full', location='Oakland')
doYoyChart(df_oak_combined, metric='Total-Empty', location='Oakland')
doYoyChart(df_oak_combined, metric='Grand-Total', location='Oakland')