The following code shows how a "backtestable" Dow universe could be constructed automatically from Wikipedia:
import pandas as pd
from bs4 import BeautifulSoup
from mediawikiapi import MediaWikiAPI

def wikitable_to_dataframe(table):
    """
    Converts a Wikipedia table parsed by BeautifulSoup into a DataFrame.
    Deals with spanning: rowspan and colspan cells are expanded as expected.
    """
    rows = table.find_all("tr")
    nrows = len(rows)
    ncols = max(len(r.find_all(["th", "td"])) for r in rows)

    # preallocate the table structure
    # (required because we need to move forward in the table
    # structure once we've found a row span)
    data = [["" for _ in range(ncols)] for _ in range(nrows)]

    # fill the table with data:
    # move across the cells and use the spans to fill extra cells
    for i, row in enumerate(rows):
        cells = row.find_all(["td", "th"])
        for j, cell in enumerate(cells):
            cspan = int(cell.get("colspan", 1))
            rspan = int(cell.get("rowspan", 1))
            offset = 0
            for k in range(rspan):
                # shift to the first empty cell of this row to avoid
                # overwriting previously inserted content
                while data[i + k][j + offset]:
                    offset += 1
                for m in range(cspan):
                    data[i + k][j + offset + m] += cell.text.strip("\n")
    return pd.DataFrame(data)

mediawikiapi = MediaWikiAPI()
test_page = mediawikiapi.page("Historical components of the Dow Jones Industrial Average")
# to check the page URL:
# print(test_page.url)
soup = BeautifulSoup(test_page.html(), "html.parser")
tables = soup.find_all("table", {"class": "wikitable"})
df_test = wikitable_to_dataframe(tables[1])
print(df_test.head())
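The raw frame still carries the header cells in row 0, and to make the universe actually backtestable the change log has to be replayed into dated membership sets. Below is a minimal sketch of both steps; the column names ("Date", "Added", "Removed") and the seed list are assumptions about the table's layout, so check df_test.head() for the actual headers first:

def build_membership_timeline(df, seed_components):
    # promote the first row to column headers (assumes a single header row)
    df = df.copy()
    df.columns = df.iloc[0]
    df = df.drop(0).reset_index(drop=True)

    # replay the change log into a date -> members mapping
    # ("Date", "Added", "Removed" are assumed column names)
    members = set(seed_components)
    timeline = {}
    for _, row in df.iterrows():
        date = pd.to_datetime(row["Date"], errors="coerce")
        if row.get("Added", ""):
            members.add(row["Added"])
        if row.get("Removed", ""):
            members.discard(row["Removed"])
        timeline[date] = sorted(members)
    return timeline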
The same snippet could be used to construct all kinds of universes.
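For instance, swapping the page title is enough to pull another index's tables. A small sketch, assuming the first wikitable on the page holds the current constituents (the table index should be verified against the live page):

sp500_page = mediawikiapi.page("List of S&P 500 companies")
sp500_soup = BeautifulSoup(sp500_page.html(), "html.parser")
sp500_tables = sp500_soup.find_all("table", {"class": "wikitable"})
# assumed: the first wikitable is the current-constituents table
df_sp500 = wikitable_to_dataframe(sp500_tables[0])
print(df_sp500.head())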
Beautiful Soup, however, does not seem to be available on QC machines. Is there any way to change that?