| Overall Statistics | |
| --- | --- |
| Total Trades | 36 |
| Average Win | 0.05% |
| Average Loss | -0.05% |
| Compounding Annual Return | 1.619% |
| Drawdown | 0.300% |
| Expectancy | 0.119 |
| Net Profit | 0.114% |
| Sharpe Ratio | 1.474 |
| Probabilistic Sharpe Ratio | 54.867% |
| Loss Rate | 44% |
| Win Rate | 56% |
| Profit-Loss Ratio | 1.01 |
| Alpha | -0.032 |
| Beta | 0.08 |
| Annual Standard Deviation | 0.011 |
| Annual Variance | 0 |
| Information Ratio | -8.995 |
| Tracking Error | 0.065 |
| Treynor Ratio | 0.202 |
| Total Fees | $36.00 |
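As a sanity check, the reported Expectancy is consistent with the usual definition, Expectancy = Win Rate × Profit-Loss Ratio - Loss Rate: with the rounded figures above, 0.56 × 1.01 - 0.44 ≈ 0.126, in line with the displayed 0.119 (the table shows rounded values, so the numbers differ slightly).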
import json
import pandas as pd
import numpy as np
from io import StringIO
from numpy.fft import fft, ifft
import numba
from talib.abstract import (
DEMA, EMA, MIDPRICE, SMA, T3, TEMA, TRIMA, WMA,
ADX, ADXR, AROONOSC, BOP, CMO, DX, MFI, MINUS_DM, MOM, ROC, RSI,
TRIX, WILLR, ATR, NATR, BBANDS, AROON, STOCHRSI,
HT_TRENDLINE, AD, OBV, HT_DCPERIOD, HT_DCPHASE, HT_TRENDMODE,
TRANGE, AVGPRICE, MEDPRICE, TYPPRICE, WCLPRICE, ULTOSC,
MAMA, SAR, SAREXT, APO, MACD, ADOSC,
HT_PHASOR, HT_SINE, STOCHF, STOCH
)
import mlfinlab as ml
from pipelines import TripleBarierLabeling, OutlierStdRemove
from model_loader import deserialize_random_forest
from statsmodels.tsa.stattools import adfuller
from method_timer import time_method
class CalibratedResistanceAtmosphericScrubbers(QCAlgorithm):
periods = [5, 30, 60, 300, 480, 2400, 12000, 96000]
std_outlier = 10
volatility_lookback = 50
volatility_scaler = 1
tb_triplebar_num_days = 3
tb_triplebar_pt_sl = [1, 1]
tb_triplebar_min_ret = 0.003
rand_state = 3
def Initialize(self):
# date, equity, brokerage and benchmark
self.SetStartDate(2016, 7, 7)
self.SetEndDate(2016, 8, 1) # 2020, 5, 15
self.SetCash(100000)
self.spy = self.AddEquity("SPY", Resolution.Minute, fillDataForward=True).SetDataNormalizationMode(DataNormalizationMode.Adjusted)
self.SetBrokerageModel(BrokerageName.InteractiveBrokersBrokerage, AccountType.Cash)
self.Settings.FreePortfolioValuePercentage = 0.5
self.SetBenchmark("SPY")
# OHLCV init
self.open = pd.Series()
self.high = pd.Series()
self.low = pd.Series()
self.close = pd.Series()
self.volume = pd.Series()
# warm-up period
self.lookback = 96100
self.SetWarmUp(self.lookback)
# ML model
self.model = self.load_model("https://github.com/MislavSag/trademl/blob/master/trademl/modeling/random_forest/rf_model.json?raw=true")
self.model_features = pd.read_csv(StringIO(self.Download('https://raw.githubusercontent.com/MislavSag/trademl/master/trademl/modeling/random_forest/feature_names.csv')), sep=',', index_col=[0])
self.model_features = self.model_features.squeeze()
self.min_d = pd.read_csv(StringIO(self.Download('https://raw.githubusercontent.com/MislavSag/trademl/master/trademl/modeling/random_forest/min_d.csv')), sep=';', names=['feature', 'value'])
self.min_d = self.min_d[1:]
self.min_d = self.min_d.loc[self.min_d['feature'].isin(self.model_features)]
self.min_d.set_index(self.min_d['feature'], inplace=True)
self.stationary_cols = self.min_d['feature'].loc[self.min_d['value'] > 0]
self.min_d = self.min_d['value'].loc[self.min_d['value'] > 0]
# timezone
self.SetTimeZone("Europe/Zagreb")
def load_model(self, url):
model = deserialize_random_forest(json.loads(self.Download(url)))
self.Log("Successfully loaded model")
return model
@time_method
def OnData(self, data):
'''OnData event is the primary entry point for your algorithm. Each new data point will be pumped in here.
Arguments:
data: Slice object keyed by symbol containing the stock data
'''
# if there is no bar data (only stock splits, dividends, etc.), skip this event
if "SPY" not in data.Bars:
return
### GET HISTORICAL OHLCV DATA
open_ = data["SPY"].Open
high_ = data["SPY"].High
low_ = data["SPY"].Low
close_ = data["SPY"].Close
volume_ = data["SPY"].Volume
self.open = self.open.append(pd.Series([open_], index=[self.Time]))[-self.lookback:]
self.high = self.high.append(pd.Series([high_], index=[self.Time]))[-self.lookback:]
self.low = self.low.append(pd.Series([low_], index=[self.Time]))[-self.lookback:]
self.close = self.close.append(pd.Series([close_], index=[self.Time]))[-self.lookback:]
self.volume = self.volume.append(pd.Series([volume_], index=[self.Time]))[-self.lookback:]
# wait until the warm-up period has finished
if self.IsWarmingUp:
return
### CALCULATES EVENTS WHEN TO TRADE
close_stationary = self.frac_diff_ffd(self.close.values, self.min_d.loc['close'])
close_stationary = pd.Series(close_stationary, index=self.close.index)
close_stationary = close_stationary.dropna()
daily_vol = ml.util.get_daily_vol(self.close, lookback=50)
cusum_events = ml.filters.cusum_filter(self.close, threshold=daily_vol.mean()*1)
if cusum_events.empty:
return
if cusum_events[-1] == self.Time:
self.Debug(self.Time)
# create a pandas DataFrame
df = pd.DataFrame({'open': self.open, 'high': self.high, 'low': self.low, 'close': self.close, 'volume': self.volume}, index=self.close.index)
### ADD FEATURES TO OHLCV
# add technical indicators
df = self.add_technical_indicators(df, self.periods)
df.columns = [cl[0] if isinstance(cl, tuple) else cl for cl in df.columns]
# add ohlc transformations
df['high_low'] = df['high'] - df['low']
df['close_open'] = df['close'] - df['open']
# simple momentum
df['mom1'] = df['close'].pct_change(periods=1)
df['mom2'] = df['close'].pct_change(periods=2)
df['mom3'] = df['close'].pct_change(periods=3)
df['mom4'] = df['close'].pct_change(periods=4)
df['mom5'] = df['close'].pct_change(periods=5)
# Volatility
df['volatility_60'] = np.log(df['close']).diff().rolling(
window=60, min_periods=60, center=False).std()
df['volatility_30'] = np.log(df['close']).diff().rolling(
window=30, min_periods=30, center=False).std()
df['volatility_15'] = np.log(df['close']).diff().rolling(
window=15, min_periods=15, center=False).std()
df['volatility_10'] = np.log(df['close']).diff().rolling(
window=10, min_periods=10, center=False).std()
df['volatility_5'] = np.log(df['close']).diff().rolling(
window=5, min_periods=5, center=False).std()
# Serial Correlation (Takes time)
# window_autocorr = 50
# df['autocorr_1'] = np.log(df['close']).diff().rolling(
# window=window_autocorr, min_periods=window_autocorr,
# center=False).apply(lambda x: x.autocorr(lag=1), raw=False)
# df['autocorr_2'] = np.log(df['close']).diff().rolling(
# window=window_autocorr, min_periods=window_autocorr,
# center=False).apply(lambda x: x.autocorr(lag=2), raw=False)
# df['autocorr_3'] = np.log(df['close']).diff().rolling(
# window=window_autocorr, min_periods=window_autocorr,
# center=False).apply(lambda x: x.autocorr(lag=3), raw=False)
# df['autocorr_4'] = np.log(df['close']).diff().rolling(
# window=window_autocorr, min_periods=window_autocorr,
# center=False).apply(lambda x: x.autocorr(lag=4), raw=False)
# df['autocorr_5'] = np.log(df['close']).diff().rolling(
# window=window_autocorr, min_periods=window_autocorr,
# center=False).apply(lambda x: x.autocorr(lag=5), raw=False)
# Skewness
df['skew_60'] = np.log(df['close']).diff().rolling(
window=60, min_periods=60, center=False).skew()
df['skew_30'] = np.log(df['close']).diff().rolling(
window=30, min_periods=30, center=False).skew()
df['skew_15'] = np.log(df['close']).diff().rolling(
window=15, min_periods=15, center=False).skew()
df['skew_10'] = np.log(df['close']).diff().rolling(
window=10, min_periods=10, center=False).skew()
df['skew_5'] = np.log(df['close']).diff().rolling(
window=5, min_periods=5, center=False).skew()
# kurtosis
df['kurtosis_60'] = np.log(df['close']).diff().rolling(
window=60, min_periods=60, center=False).kurt()
df['kurtosis_30'] = np.log(df['close']).diff().rolling(
window=30, min_periods=30, center=False).kurt()
df['kurtosis_15'] = np.log(df['close']).diff().rolling(
window=15, min_periods=15, center=False).kurt()
df['kurtosis_10'] = np.log(df['close']).diff().rolling(
window=10, min_periods=10, center=False).kurt()
df['kurtosis_5'] = np.log(df['close']).diff().rolling(
window=5, min_periods=5, center=False).kurt()
### KEEP ONLY COLUMNS NEEDED FOR MODEL
df = df[self.model_features.to_list()]
### REMOVE NAN VALUES
df = df.dropna()
### MAKE SERIES STATIONARY
df = self.unstat_cols_to_stat(df, self.min_d, self.stationary_cols)
### PREDICTIONS
# prediction = np.random.choice([0, 1], replace=True, p=[.5, .5])
prediction = self.model.predict(df.iloc[[-1], :])[0] # take the scalar prediction for the last bar
if self.Securities["SPY"].Invested and prediction == -1:
self.Liquidate("SPY")
elif not self.Securities["SPY"].Invested and prediction == 1:
self.SetHoldings("SPY", .5)
def OnOrderEvent(self, orderEvent):
pass
@time_method
def add_ind(self, ohlcv, f, n, periods):
"""
Add technical indicator to pd.DataFrame
Parameters
----------
f : function
Function from the TA-Lib package.
n : str
Name prefix.
periods : list
Periods for which to compute the indicator.
Returns
-------
pd.DataFrame
"""
ind = pd.concat([f(ohlcv, p).rename(n + str(p)) for p in periods],
axis=1)
return ind
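# Illustration (hedged, relying on TA-Lib's abstract API): for a single-output
# indicator, self.add_ind(df, SMA, 'SMA', [5, 30]) returns a frame with one
# column per period, named 'SMA5' and 'SMA30'.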
@time_method
def add_ind_df(self, ohlcv, f, n, periods):
"""
Add technical indicator to pd.DataFrame when the indicator has multiple
outputs.
Parameters
----------
f : function
Function from the TA-Lib package.
n : str
Name prefix.
periods : list
Periods for which to compute the indicator.
Returns
-------
pd.DataFrame
"""
ind = [f(ohlcv, p).add_prefix((f._Function__namestr + '_' + str(p) + '_'))
for p in periods]
# ind = [f(ohlcv, p).
# set_axis((f._Function__namestr + '_' +
# pd.Series(f.output_names) + '_' + str(p)), axis=1)
# for p in periods]
ind = pd.concat(ind, axis=1)
return ind
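# Illustration (hedged): for a multi-output indicator such as BBANDS,
# self.add_ind_df(df, BBANDS, 'BBANDS', [5]) returns columns prefixed 'BBANDS_5_'
# (e.g. 'BBANDS_5_upperband', 'BBANDS_5_middleband', 'BBANDS_5_lowerband').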
@time_method
def add_technical_indicators(self, data, periods):
"""Add tecnical indicators as featues.
Arguments:
data {pd.DataFrame} -- Pandas data frame with OHLC data
periods {list} -- List that contain periods as arguments.
Returns:
pd.dataFrame -- Pandas data frame with additional indicators
"""
# add technical indicators for various periods when the indicator has one output
indsList = [DEMA, EMA, MIDPRICE, SMA, T3, # MIDPOINT
TEMA, TRIMA, WMA, # KAMA memory intensive!
ADX, ADXR, AROONOSC, BOP, CMO, DX, MFI, MINUS_DM, MOM, ROC, RSI,
TRIX, WILLR, # CCI doesn't work (probably)
ATR, NATR]
inds = [self.add_ind(data, f, f._Function__name.decode('ascii'), periods)
for f in indsList]
inds = pd.concat(inds, axis=1)
data = pd.concat([data, inds], axis=1)
# add technical indicators for various periods when the indicator has
# multiple outputs
indsList = [BBANDS, AROON, STOCHRSI]
inds = [self.add_ind_df(data, f, f._Function__name.decode('ascii'), periods)
for f in indsList]
inds = pd.concat(inds, axis=1)
data = pd.concat([data, inds], axis=1)
# add technical indicators that take no period argument
indsList = [HT_TRENDLINE, AD, OBV, HT_DCPERIOD, HT_DCPHASE, HT_TRENDMODE,
TRANGE, AVGPRICE, MEDPRICE, TYPPRICE, WCLPRICE,
ULTOSC]
inds = [f(data).rename(f._Function__name.decode('ascii')) for f in indsList]
inds = pd.concat(inds, axis=1)
data = pd.concat([data, inds], axis=1)
# add other indicators
data[['MAMA', 'FAMA']] = MAMA(data) # MAVP doesn't work
data[['MAMA_25', 'FAMA_25']] = MAMA(data, fastlimit=0.25, slowlimit=0.02)
data[['MAMA_5', 'FAMA_5']] = MAMA(data, fastlimit=0.5, slowlimit=0.05)
data['SAR'] = SAR(data)
data['SAR_1'] = SAR(data, acceleration=0.01, maximum=0.01)
data['SAR_2'] = SAR(data, acceleration=0.02, maximum=0.02)
data['SAREXT'] = SAREXT(data)
startvalue, offsetonreverse, accelerationinitlong, accelerationlong,\
accelerationmaxlong, accelerationinitshort, accelerationshort,\
accelerationmaxshort = np.random.uniform(low=0.01, high=0.4, size=8)
data['SAREXT_rand'] = SAREXT(data, startvalue=startvalue,
offsetonreverse=offsetonreverse,
accelerationinitlong=accelerationinitlong,
accelerationlong=accelerationlong,
accelerationmaxlong=accelerationmaxlong,
accelerationinitshort=accelerationinitshort,
accelerationshort=accelerationshort,
accelerationmaxshort=accelerationmaxshort)
data['APO'] = APO(data)
data['APO_1'] = APO(data, fastperiod=24, slowperiod=52, matype=0)
data['APO_2'] = APO(data, fastperiod=50, slowperiod=100, matype=0)
data['APO_3'] = APO(data, fastperiod=100, slowperiod=200, matype=0)
data['APO_4'] = APO(data, fastperiod=200, slowperiod=400, matype=0)
data['APO_5'] = APO(data, fastperiod=12000, slowperiod=24000, matype=0)
data['ADOSC'] = ADOSC(data)
data[['MACD', 'MACDSIGNAL', 'MACDHIST']] = MACD(data)
data[['inphase', 'quadrature']] = HT_PHASOR(data)
data[['sine', 'leadsine']] = HT_SINE(data)
data[['fastk', 'fastd']] = STOCHF(data)
data[['fastk_20', 'fastd_20']] = STOCHF(data, fastk_period=20, fastd_period=9, fastd_matype=0)
data[['fastk_200', 'fastd_200']] = STOCHF(data, fastk_period=200, fastd_period=80, fastd_matype=0)
data[['fastk_3600', 'fastd_3600']] = STOCHF(data, fastk_period=3600, fastd_period=400, fastd_matype=0)
data[['slowk', 'slowd']] = STOCH(data)
data[['slowk_30', 'slowd_30']] = STOCH(data, fastk_period=30, slowk_period=15,
slowk_matype=0, slowd_period=9, slowd_matype=0)
return data
@time_method
def get_weights(self, d, size):
"""Expanding window fraction difference weights."""
w = [1.0]
for k in range(1, size):
w_ = -w[-1] / k * (d - k + 1)
w.append(w_)
w = np.array(w[::-1]).reshape(-1, 1)
return w
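# Worked example: with d = 0.5 the recursion w_k = -w_{k-1} * (d - k + 1) / k
# gives w = [1.0, -0.5, -0.125, -0.0625, ...], so self.get_weights(0.5, 4) returns
# the first four weights reversed as a column vector: [[-0.0625], [-0.125], [-0.5], [1.0]].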
@time_method
#@numba.njit
def get_weights_ffd(self, d, thres, lim=99999):
"""Fixed width window fraction difference weights.
Set lim to be large if you want to only stop at thres.
Set thres to be zero if you want to ignore it.
"""
w = [1.0]
k = 1
for i in range(1, lim):
w_ = -w[-1] / k * (d - k + 1)
if abs(w_) < thres:
break
w.append(w_)
k += 1
w = np.array(w[::-1]).reshape(-1, 1)
return w
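# Worked example: self.get_weights_ffd(0.5, thres=0.1) stops once |w_k| < 0.1,
# keeping [1.0, -0.5, -0.125] and returning them reversed: [[-0.125], [-0.5], [1.0]].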
@time_method
def frac_diff_ffd(self, x, d, thres=1e-4, lim=None):
assert isinstance(x, np.ndarray)
assert x.ndim == 1
if lim is None:
lim = len(x)
w, out = self._frac_diff_ffd(x, d, lim, thres=thres)
# print(f'weights is shape {w.shape}')
return out
@time_method
#@numba.njit
def _frac_diff_ffd(self, x, d, lim, thres=1e-4):
"""d is any positive real"""
w = self.get_weights_ffd(d, thres, lim)
width = len(w) - 1
output = []
output.extend([np.nan] * width) # the first `width` entries lack a full window, so pad with NaN
for i in range(width, len(x)):
output.append(np.dot(w.T, x[i - width: i + 1])[0])
return w, np.array(output)
@time_method
def fast_frac_diff(self, x, d):
"""expanding window version using fft form"""
assert isinstance(x, np.ndarray)
T = len(x)
np2 = int(2 ** np.ceil(np.log2(2 * T - 1)))
k = np.arange(1, T)
b = (1,) + tuple(np.cumprod((k - d - 1) / k))
z = (0,) * (np2 - T)
z1 = b + z
z2 = tuple(x) + z
dx = ifft(fft(z1) * fft(z2))
return np.real(dx[0:T])
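# Note: the FFT pair computes the linear convolution of x with the expanding
# binomial weights b_k = prod_{j=1..k} (j - d - 1) / j (with b_0 = 1), i.e. the
# full expanding-window fractional difference, in contrast to the fixed-width
# frac_diff_ffd above.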
@time_method
def test_frac_diff_ffd_equals_original_impl(self, d=3):
from .prado_orig import fracDiff_FFD_original_impl
import pandas as pd
x = np.random.randn(100)
a = self.frac_diff_ffd(x, d, thres=1e-4)
b = fracDiff_FFD_original_impl(pd.DataFrame(x), d, thres=1e-4)
assert np.allclose(a, b)
# return locals()
@time_method
def test_fast_frac_diff_equals_fracDiff_original_impl(self, d=3):
from .prado_orig import fracDiff_original_impl
import pandas as pd
x = np.random.randn(100)
a = self.fast_frac_diff(x, d)
b = fracDiff_original_impl(pd.DataFrame(x), d, thres=None)
b = b.values
assert a.shape == b.shape
assert np.allclose(a, b)
# return locals()
@time_method
def min_ffd_value(self, unstationary_series, d_domain, pvalue_threshold=0.05):
"""
Source: Chapter 5, AFML (section 5.5, page 83);
Minimal value of d which makes pandas series stationary.
References:
https://www.wiley.com/en-us/Advances+in+Financial+Machine+Learning-p-9781119482086
https://wwwf.imperial.ac.uk/~ejm/M3S8/Problems/hosking81.pdf
Constant width window (new solution)
Note 1: thresh determines the cut-off weight for the window
Note 2: diff_amt can be any positive fractional, not necessarily bounded [0, 1].
:param unstationary_series: (pd.Series)
:param d_domain: (np.array) numpy linspace; possible d values
:param pvalue_threshold: (float) ADF p-value threshold above which nonstationary
:return: (float) minimum value of d which makes series stationary
"""
d_min = None
for d_i in d_domain:
# resample series to daily frequency
df1 = unstationary_series.resample('1D').last()
df1.dropna(inplace=True)
df1 = df1.squeeze()
# fracDiff for d
df2 = self.frac_diff_ffd(df1.values, d=d_i, thres=1e-4, lim=None)
df2 = pd.Series(df2, index=df1.index).dropna()
# ADF test
df2 = adfuller(df2.squeeze(), maxlag=1, regression='c', autolag=None)
# if the p-value is at or below the threshold, the series is stationary; stop and return d
if df2[1] <= pvalue_threshold:
d_min = d_i
break
return d_min
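# Usage sketch (hedged; the d grid is a free choice): for example,
# self.min_ffd_value(close_series, np.linspace(0, 1, 11)) returns the smallest
# d on the grid whose ADF p-value is <= 0.05, or None if none qualifies.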
@time_method
def unstat_cols_to_stat(self, data, min_d, stationaryCols):
"""
Convert unstationary columns to stationary.
:param data: (pd.DataFrame) Pandas DF with unstationary columns.
:return: (pd.DataFrame) Pandas DF with stationary columns.
"""
# fractionally differentiate the columns that need it
dataStationary = data[stationaryCols].loc[:, min_d > 0]
diff_amt_args = min_d[min_d > 0].to_list()
for i, col in enumerate(dataStationary.columns):
dataStationary[col] = self.frac_diff_ffd(dataStationary[col].values, diff_amt_args[i])
# write the stationary columns back into the data frame
columnsToChange = data[stationaryCols].loc[:, min_d > 0].columns
data[columnsToChange] = dataStationary
data.dropna(inplace=True)
return data

# method_timer.py
import time
from functools import wraps
def time_method(func):
@wraps(func)
def timed(*args, **kw):
time_thresh = 1 # print a timing message if the call takes longer than this many seconds
ts = time.time()
result = func(*args, **kw)
te = time.time()
if te - ts > time_thresh:
algo = args[0]
algo.Debug("%r took %2.2f seconds to run." % (func.__name__, te - ts))
return result
return timed

# Your New Python File
import pandas as pd
import numpy as np
import mlfinlab as ml
class CalibratedResistanceAtmosphericScrubbers(QCAlgorithm):
periods = [5, 30, 60, 300, 480, 2400, 12000, 96000] # copied from the main algorithm; self.lookback below depends on it
volatility_lookback = 50 # copied from the main algorithm; used in OnData below
def Initialize(self):
self.SetStartDate(2019, 1, 1) # Set Start Date
self.SetEndDate(2019, 3, 1)
self.SetCash(100000) # Set Strategy Cash
self.spy = self.AddEquity("SPY", Resolution.minute)
self.spy.SetDataNormalizationMode(DataNormalizationMode.Adjusted) # Raw, SplitAdjusted, TotalReturn
self.SetBrokerageModel(BrokerageName.InteractiveBrokersBrokerage, AccountType.Cash)
# init close prices
self.open = np.array([])
self.high = np.array([])
self.low = np.array([])
self.close = np.array([])
self.volume = np.array([])
self.lookback = max(self.periods)
self.SetWarmUp(self.lookback * 2)
def OnData(self, data):
'''OnData event is the primary entry point for your algorithm. Each new data point will be pumped in here.
Arguments:
data: Slice object keyed by symbol containing the stock data
'''
if "SPY" not in data.Bars:
return
open_ = data["SPY"].Open
high_ = data["SPY"].High
low_ = data["SPY"].Low
close_ = data["SPY"].Close
volume_ = data["SPY"].Volume
self.open = np.append(self.open, open_)[-self.lookback*2:]
self.high = np.append(self.high, high_)[-self.lookback*2:]
self.low = np.append(self.low, low_)[-self.lookback*2:]
self.close = np.append(self.close, close_)[-self.lookback*2:]
self.volume = np.append(self.volume, volume_)[-self.lookback*2:]
self.time = self.Time
if self.IsWarmingUp:
return
df = pd.DataFrame({'open': self.open, 'high': self.high, 'low': self.low, 'close': self.close, 'volume': self.volume})
# TODO: build a datetime index for df from the timestamps of the stored close prices
# compute volatility - this is the function that needs to run on every step
daily_vol = ml.util.get_daily_vol(self.close, lookback=self.volatility_lookback)

# pipelines.py
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import mlfinlab as ml
class TripleBarierLabeling(BaseEstimator, TransformerMixin):
def __init__(self, close_name='close', volatility_lookback=50,
volatility_scaler=1, triplebar_num_days=3,
triplebar_pt_sl=[1, 1], triplebar_min_ret=0.003,
num_threads=1):
# hyperparameters for all functions
self.close_name = close_name
self.volatility_lookback = volatility_lookback
self.volatility_scaler = volatility_scaler
self.triplebar_num_days = triplebar_num_days
self.triplebar_pt_sl = triplebar_pt_sl
self.triplebar_min_ret = triplebar_min_ret
self.num_threads = num_threads
def fit(self, X, y=None):
# extract close series
close = X.loc[:, self.close_name]
# Compute volatility
daily_vol = ml.util.get_daily_vol(
close,
lookback=self.volatility_lookback)
# Apply Symmetric CUSUM Filter and get timestamps for events
cusum_events = ml.filters.cusum_filter(
close,
threshold=daily_vol.mean()*self.volatility_scaler)
# Compute vertical barrier
vertical_barriers = ml.labeling.add_vertical_barrier(
t_events=cusum_events,
close=close,
num_days=self.triplebar_num_days)
# triple barrier events
triple_barrier_events = ml.labeling.get_events(
close=close,
t_events=cusum_events,
pt_sl=self.triplebar_pt_sl,
target=daily_vol,
min_ret=self.triplebar_min_ret,
num_threads=self.num_threads,
vertical_barrier_times=vertical_barriers)
# labels
labels = ml.labeling.get_bins(triple_barrier_events, close)
labels = ml.labeling.drop_labels(labels)
# merge labels and triple barrier events
self.triple_barrier_info = pd.concat([triple_barrier_events.t1, labels], axis=1)
self.triple_barrier_info.dropna(inplace=True)
return self
def transform(self, X, y=None):
# subsample
X = X.reindex(self.triple_barrier_info.index)
return X
class OutlierStdRemove(BaseEstimator, TransformerMixin):
def __init__(self, std_threshold):
self.std_threshold = std_threshold
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
X = X[X.apply(lambda x: np.abs(x - x.mean()) / x.std() < self.std_threshold).all(axis=1)]
return X
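# Illustration (hedged): OutlierStdRemove(10).fit_transform(df) keeps only rows
# where every column lies within 10 standard deviations of its column mean.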
### TESTS
# DATA_PATH = 'C:/Users/Mislav/algoAItrader/data/spy_with_vix.h5'
# df = pd.read_hdf(DATA_PATH, start=0, stop=4000)
# ### HYPER PARAMETERS
# std_outlier = 10
# tb_volatility_lookback = 50
# tb_volatility_scaler = 1
# tb_triplebar_num_days = 3
# tb_triplebar_pt_sl = [1, 1]
# tb_triplebar_min_ret = 0.003
# # triple barrier alone
# triple_barrier_pipe= TripleBarierLabeling(
# close_name='close_orig',
# volatility_lookback=tb_volatility_lookback,
# volatility_scaler=tb_volatility_scaler,
# triplebar_num_days=tb_triplebar_num_days,
# triplebar_pt_sl=tb_triplebar_pt_sl,
# triplebar_min_ret=tb_triplebar_min_ret,
# num_threads=1
# )
# tb_fit = triple_barrier_pipe.fit(df)
# tb_fit.triple_barrier_info
# X = triple_barrier_pipe.transform(df)
# #
# pipeline = Pipeline([
# ('remove_outlier', OutlierStdRemove(10)),
# ('triple_barrier_labeling', TripleBarierLabeling(close_name='close_orig')),
# ])
# pipe_out = pipeline.fit_transform(df)

# model_loader.py
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree._tree import Tree
from sklearn.ensemble import RandomForestClassifier
def serialize_tree(tree):
serialized_tree = tree.__getstate__()
dtypes = serialized_tree['nodes'].dtype
serialized_tree['nodes'] = serialized_tree['nodes'].tolist()
serialized_tree['values'] = serialized_tree['values'].tolist()
return serialized_tree, dtypes
def deserialize_tree(tree_dict, n_features, n_classes, n_outputs):
tree_dict['nodes'] = [tuple(lst) for lst in tree_dict['nodes']]
names = ['left_child', 'right_child', 'feature', 'threshold', 'impurity', 'n_node_samples', 'weighted_n_node_samples']
tree_dict['nodes'] = np.array(tree_dict['nodes'], dtype=np.dtype({'names': names, 'formats': tree_dict['nodes_dtype']}))
tree_dict['values'] = np.array(tree_dict['values'])
tree = Tree(n_features, np.array([n_classes], dtype=np.intp), n_outputs)
tree.__setstate__(tree_dict)
return tree
def serialize_decision_tree(model):
tree, dtypes = serialize_tree(model.tree_)
serialized_model = {
'meta': 'decision-tree',
'feature_importances_': model.feature_importances_.tolist(),
'max_features_': model.max_features_,
'n_classes_': int(model.n_classes_),
'n_features_': model.n_features_,
'n_outputs_': model.n_outputs_,
'tree_': tree,
'classes_': model.classes_.tolist(),
'params': model.get_params()
}
tree_dtypes = []
for i in range(0, len(dtypes)):
tree_dtypes.append(dtypes[i].str)
serialized_model['tree_']['nodes_dtype'] = tree_dtypes
return serialized_model
def deserialize_decision_tree(model_dict):
deserialized_model = DecisionTreeClassifier(**model_dict['params'])
deserialized_model.classes_ = np.array(model_dict['classes_'])
deserialized_model.max_features_ = model_dict['max_features_']
deserialized_model.n_classes_ = model_dict['n_classes_']
deserialized_model.n_features_ = model_dict['n_features_']
deserialized_model.n_outputs_ = model_dict['n_outputs_']
tree = deserialize_tree(model_dict['tree_'], model_dict['n_features_'], model_dict['n_classes_'], model_dict['n_outputs_'])
deserialized_model.tree_ = tree
return deserialized_model
def serialize_random_forest(model):
serialized_model = {
'meta': 'rf',
'max_depth': model.max_depth,
'min_samples_split': model.min_samples_split,
'min_samples_leaf': model.min_samples_leaf,
'min_weight_fraction_leaf': model.min_weight_fraction_leaf,
'max_features': model.max_features,
'max_leaf_nodes': model.max_leaf_nodes,
'min_impurity_decrease': model.min_impurity_decrease,
'min_impurity_split': model.min_impurity_split,
'n_features_': model.n_features_,
'n_outputs_': model.n_outputs_,
'classes_': model.classes_.tolist(),
'estimators_': [serialize_decision_tree(decision_tree) for decision_tree in model.estimators_],
'params': model.get_params()
}
if 'oob_score_' in model.__dict__:
serialized_model['oob_score_'] = model.oob_score_
if 'oob_decision_function_' in model.__dict__:
serialized_model['oob_decision_function_'] = model.oob_decision_function_.tolist()
if isinstance(model.n_classes_, int):
serialized_model['n_classes_'] = model.n_classes_
else:
serialized_model['n_classes_'] = model.n_classes_.tolist()
return serialized_model
def deserialize_random_forest(model_dict):
model = RandomForestClassifier(**model_dict['params'])
estimators = [deserialize_decision_tree(decision_tree) for decision_tree in model_dict['estimators_']]
model.estimators_ = np.array(estimators)
model.classes_ = np.array(model_dict['classes_'])
model.n_features_ = model_dict['n_features_']
model.n_outputs_ = model_dict['n_outputs_']
model.max_depth = model_dict['max_depth']
model.min_samples_split = model_dict['min_samples_split']
model.min_samples_leaf = model_dict['min_samples_leaf']
model.min_weight_fraction_leaf = model_dict['min_weight_fraction_leaf']
model.max_features = model_dict['max_features']
model.max_leaf_nodes = model_dict['max_leaf_nodes']
model.min_impurity_decrease = model_dict['min_impurity_decrease']
model.min_impurity_split = model_dict['min_impurity_split']
if 'oob_score_' in model_dict:
model.oob_score_ = model_dict['oob_score_']
if 'oob_decision_function_' in model_dict:
model.oob_decision_function_ = model_dict['oob_decision_function_']
if isinstance(model_dict['n_classes_'], list):
model.n_classes_ = np.array(model_dict['n_classes_'])
else:
model.n_classes_ = model_dict['n_classes_']
return model
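### EXAMPLE
# A minimal round-trip sketch (hedged: assumes a scikit-learn version that still
# exposes n_features_ and min_impurity_split, as the functions above expect):
# import json
# X = np.random.randn(200, 5)
# y = (X[:, 0] > 0).astype(int)
# rf = RandomForestClassifier(n_estimators=10, random_state=0).fit(X, y)
# payload = json.dumps(serialize_random_forest(rf)) # the format rf_model.json stores
# rf2 = deserialize_random_forest(json.loads(payload))
# assert np.array_equal(rf.predict(X), rf2.predict(X))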