Backtest

Overall Statistics
Total Orders 473 Average Win 0.59% Average Loss -0.52% Compounding Annual Return 2.478% Drawdown 8.900% Expectancy 0.161 Start Equity 1000000 End Equity 1206189.35 Net Profit 20.619% Sharpe Ratio -0.072 Sortino Ratio -0.075 Probabilistic Sharpe Ratio 0.882% Loss Rate 45% Win Rate 55% Profit-Loss Ratio 1.13 Alpha -0.024 Beta 0.216 Annual Standard Deviation 0.065 Annual Variance 0.004 Information Ratio -0.697 Tracking Error 0.133 Treynor Ratio -0.022 Total Fees $3502.29 Estimated Strategy Capacity $68000000.00 Lowest Capacity Asset LRCX R735QTJ8XC9X Portfolio Turnover 1.52%

# region imports
from AlgorithmImports import *
# endregion


class EmpiricalCumulativeDensityFunction(PythonIndicator):
    """Same-sign empirical tail frequency of a rate-of-change series.

    The indicator feeds each price into an internal ``RateOfChange`` and keeps
    a rolling window of the last ``lookback_period`` ROC values. Once the
    window is full, every new ROC value is scored against the window:

    * ``roc > 0``:  count(window >= roc) / count(window >= 0)
    * otherwise:    count(window <= roc) / count(window <= 0)

    A small value therefore means the latest move is extreme relative to past
    moves in the same direction. The value is 0 when the window holds no
    observation on the relevant side of zero.

    Readiness is exposed through ``is_ready2``; it is ``True`` only after an
    update that produced a new value.
    """

    def __init__(self, roc_period, lookback_period):
        """Create the indicator.

        Args:
            roc_period: Period of the internal ``RateOfChange``.
            lookback_period: Number of past ROC values kept in the window.
        """
        self.name = f'ECDF_{roc_period}_{lookback_period}'
        self.warm_up_period = lookback_period
        self.time = datetime.min
        self.value = 0
        self.current = IndicatorDataPoint(self.time, self.value)
        self._roc = RateOfChange(roc_period)
        self._roc.name = 'ECDF_ROC'
        self._returns = np.array([])
        # Defined up front so the flag exists before the first update; it was
        # previously created only inside update().
        self.is_ready2 = False

    def update(self, t, price):
        """Feed one price observation and refresh ``value``/``current``.

        Args:
            t: Timestamp of the observation.
            price: Price passed to the internal ``RateOfChange``.
        """
        self.time = t
        # Assume "not ready" until a value is actually produced below.
        self.is_ready2 = False

        if not self._roc.update(t, price):
            return
        roc = self._roc.current.value

        # Still filling the window: store the ROC value, emit nothing.
        if len(self._returns) < self.warm_up_period:
            self._returns = np.append(self._returns, roc)
            return

        # Score the new ROC against the window *before* it is added to it.
        if roc > 0:
            denominator = np.count_nonzero(self._returns >= 0)
            numerator = np.count_nonzero(self._returns >= roc)
        else:
            denominator = np.count_nonzero(self._returns <= 0)
            numerator = np.count_nonzero(self._returns <= roc)
        self.value = numerator / denominator if denominator else 0

        # Roll the window: append the newest value, drop the oldest.
        self._returns = np.append(self._returns, roc)[1:]
        self.is_ready2 = True
        self.current = IndicatorDataPoint(t, self.value)

# region imports
from AlgorithmImports import *

from symbol_data import SymbolData

import xgboost as xgb
from sklearn.preprocessing import StandardScaler
# endregion

# Sources:
# - https://www.quantitativo.com/p/a-mean-reversion-strategy-from-first
# - https://www.quantitativo.com/p/machine-learning-and-the-probability
# - https://www.quantitativo.com/p/long-and-short-mean-reversion-machine

# TODO: 
#  - Factor pre-processing
#  - Add more factors
#  - Change QPI lookback to 5 years?
#  - Limit factor history to x years
#  - Add liquidity constraint?
#  - Change ETF to Russell 3000
# pt 2:
#  - Use 20 max_universe_size instead of 10? 
#  - Add stop-loss and time-limit exit? 
#  - Add the short side? With a VIX regime filter to adjust exposure on each side.

class ProbabilityOfBouncingBackAlgorithm(QCAlgorithm):
    """Mean-reversion strategy on QQQ constituents gated by an XGBoost model.

    Each day the ETF universe filter keeps constituents whose recent move is
    negative and extreme (low ECDF value) and whose predicted probability from
    the trained classifier exceeds a threshold. The selected assets are then
    given equal target weights shortly after the open.
    """

    def initialize(self):
        """Configure dates, cash, state, universes, training and trading."""
        self.set_start_date(2017, 1, 1)
        self.set_cash(1_000_000)

        # State used by the universe/training callbacks. It is created before
        # any callback is registered so no callback can observe a missing
        # attribute.
        self._symbol_data_by_symbol = {}
        self._price_by_symbol = {}
        self._model = None
        self._scaler = StandardScaler()
        self._max_universe_size = self.get_parameter('max_universe_size', 10)
        # The key used to be 'ecdf_thresholdobability_threshold' (a copy-paste
        # slip), which made the threshold impossible to set by its real name.
        self._ecdf_threshold = self.get_parameter('ecdf_threshold', 0.15)
        self._probability_threshold = self.get_parameter('probability_threshold', 0.6)

        self.add_universe(self._get_asset_prices)
        etf = Symbol.create('QQQ', SecurityType.EQUITY, Market.USA)
        self._universe = self.add_universe(self.universe.etf(etf, universe_filter_func=self._select_assets))

        self.train(self.date_rules.year_start(), self.time_rules.midnight, self._train_model)
        self.schedule.on(self.date_rules.every_day(etf), self.time_rules.after_market_open(etf, 1), self._trade)
        self.set_warm_up(self.start_date - datetime(2015, 1, 1)) # Min start date for QQQ

    def _get_asset_prices(self, fundamentals):
        """Cache the latest price of every asset; selects nothing itself."""
        # Save the current price of the assets so we can update the factors in _select_assets.
        self._price_by_symbol = {f.symbol: f.price for f in fundamentals}
        return []

    def _select_assets(self, constituents):
        """Update factor data and return the ETF constituents to trade.

        Args:
            constituents: Current ETF constituents (objects exposing ``symbol``).

        Returns:
            Up to ``max_universe_size`` symbols with the highest predicted
            probability, or an empty list while warming up or before a model
            has been trained.
        """
        # Create SymbolData objects for assets that just entered the ETF.
        etf_symbols = [c.symbol for c in constituents]
        new_symbols = []
        for symbol in etf_symbols:
            if symbol not in self._symbol_data_by_symbol:
                new_symbols.append(symbol)
                self._symbol_data_by_symbol[symbol] = SymbolData()
        # Sets give O(1) membership tests in the loops below.
        new_symbol_set = set(new_symbols)
        etf_symbol_set = set(etf_symbols)

        # Warm up the factors for assets that just entered the ETF (or we haven't seen yet).
        # TODO: Ensure this lookback period is sufficient in case an asset leaves the ETF for a while.
        if new_symbols:
            for bars in self.history[TradeBar](new_symbols, 300, Resolution.DAILY):
                for symbol, bar in bars.items():
                    self._symbol_data_by_symbol[symbol].update(bar.end_time, bar.close, True)

        # Update the factors for the rest of the assets we're tracking.
        for symbol, symbol_data in self._symbol_data_by_symbol.items():
            if symbol in new_symbol_set or symbol not in self._price_by_symbol:
                continue
            symbol_data.update(self.time, self._price_by_symbol[symbol], symbol in etf_symbol_set)

        if self.is_warming_up:
            return []
        # No trained model yet (e.g. training found no samples): select nothing.
        if self._model is None:
            return []

        # Select a subset of the current ETF constituents.
        probability_by_symbol = {}
        for c in constituents:
            # Filter 1: price >= $1. `not price >= 1` also rejects NaN prices.
            price = self._price_by_symbol.get(c.symbol)
            if price is None or not price >= 1:
                continue
            symbol_data = self._symbol_data_by_symbol[c.symbol]
            # Filter 2: Factor values are ready.
            if not symbol_data.is_ready:
                continue
            # Filter 3: the ECDF's internal rate of change is negative and the
            # ECDF value is below the threshold (default 0.15).
            if not (symbol_data.ecdf._roc.current.value < 0 and symbol_data.ecdf.value < self._ecdf_threshold):
                continue
            # Filter 4: predicted probability above the threshold (default 0.6).
            raw_factors = symbol_data.factor_history.iloc[-1].drop(['in_etf', 'ECDF_ROC']).values.reshape(1, -1)
            p = self._model.predict(xgb.DMatrix(self._scaler.transform(raw_factors)))[0]
            if p > self._probability_threshold:
                probability_by_symbol[c.symbol] = p

        self.plot('Universe', 'Size', len(probability_by_symbol))

        # Return at most `max_universe_size` assets with the greatest probability.
        ranked = sorted(probability_by_symbol.items(), key=lambda x: x[1])
        return [symbol for symbol, _ in ranked[-self._max_universe_size:]]

    def on_warmup_finished(self):
        """Train the first model as soon as warm-up data is complete."""
        self._train_model()

    def _train_model(self):
        """Fit the scaler and the XGBoost model on all stored factor history.

        Does nothing while warming up. If no sample passes the filters, the
        previous model (if any) is kept.
        """
        if self.is_warming_up:
            return

        # Get training samples.
        factors = []
        labels = []
        for symbol, symbol_data in self._symbol_data_by_symbol.items():
            if not symbol_data.is_ready:
                continue
            history = symbol_data.factor_history
            # Select samples where the asset was in the ETF, `ECDF_3_252` is
            # below the threshold, and the ECDF's rate of change is negative.
            factor_history = history[
                (history['in_etf']) &
                (history['ECDF_3_252'] < self._ecdf_threshold) &
                (history['ECDF_ROC'] < 0)
            ].dropna().drop(['in_etf', 'ECDF_ROC'], axis=1)
            # Align this asset's factor and labels.
            target_history = symbol_data.target_history
            idx = sorted(set(target_history.index) & set(factor_history.index))
            factor_history = factor_history.loc[idx]
            target_history = target_history.loc[idx]
            # Append this asset's factors and labels to the total set of factors/labels.
            if not (factor_history.empty or target_history.empty):
                factors.extend(factor_history.values.tolist())
                labels.extend(target_history.values.tolist())

        # Fitting on an empty sample set would raise inside the scaler.
        if not factors:
            self.log('_train_model: no training samples available; keeping the previous model.')
            return
        factors = np.array(factors)
        labels = np.array(labels)

        # Apply pre-processing to the factors.
        factors = self._scaler.fit_transform(factors)

        # Train the model.
        self._model = xgb.train(
            {
                'booster': 'gbtree',
                'colsample_bynode': 0.8,
                'learning_rate': 0.1,
                'lambda': 0.1,
                'max_depth': 5,
                'num_parallel_tree': 100,
                'objective': 'binary:logistic',
                'subsample': 0.8,
            },
            xgb.DMatrix(factors, label=labels),
            num_boost_round=2
        )

    def _trade(self):
        """Give each selected asset a 1/max_universe_size target weight.

        When nothing is selected the current holdings are left untouched.
        """
        if not self._universe.selected:
            return
        targets = [PortfolioTarget(symbol, 1/self._max_universe_size) for symbol in self._universe.selected]
        # NOTE(review): the second argument is presumably LEAN's
        # liquidate-existing-holdings flag — confirm against the API docs.
        self.set_holdings(targets, True)

# region imports
from AlgorithmImports import *

from ecdf import EmpiricalCumulativeDensityFunction
# endregion


class SymbolData:
    """Per-asset container for factor indicators, factor history and labels.

    ``update`` pushes one price into every indicator, records the values of
    the ready indicators as a new row of ``factor_history`` (columns are named
    after the indicators, plus ``in_etf``), and stores the price so labels can
    be derived from it.
    """

    # For features, I used: (I ended up with 16 features)
    # - Rates of change for different windows (short, mid, and long terms, up to a year);
    # - RSIs for different windows;
    # - QPIs for different windows;
    # - IBS, Normalized ATR;
    # - Closing price distance to 200-day SMA;
    # - Turnover;
    # - Hurst exponent (I love this indicator and probably will write something especially for it in the future).

    # For features such as Turnover, I computed its relative value vs. past (time series) and relative value vs. all other stocks every single day (cross-sectional);
    # For some other features, it makes sense only to compute its relative value vs. all other stocks every single day (cross-sectional);

    # For the target, we set 1 if the stock bounced back within 5 days (positive return) and 0 otherwise (further negative return).

    # Number of observations ahead used to build the label.
    _TARGET_HORIZON = 5

    def __init__(self):
        # Define features.
        self.ecdf = EmpiricalCumulativeDensityFunction(3, 252) # Add more windows
        ecdf_indicators = [EmpiricalCumulativeDensityFunction(3, 21*i) for i in [3, 6, 9]]
        self.roc = RateOfChange(21)                       # Add more windows  ---> CS
        roc_indicators = [RateOfChange(21*i) for i in [3, 6, 9, 12]]
        self.rsi = RelativeStrengthIndex(14)              # Add more windows
        rsi_indicators = [RelativeStrengthIndex(21*i) for i in [3, 6, 9, 12]]
        self._sma = SimpleMovingAverage(200)
        self._price = Identity('price')
        self.sma_distance = IndicatorExtensions.over(self._sma, self._price)  #  ---> CS
        #self.ibs = 0 # normalized by ATR
        #self.turnover = 0
        #self.hurst_exponent = 0

        # One row per update timestamp; one column per recorded factor.
        self.factor_history = pd.DataFrame()

        # Indicators that receive each price directly.
        self.factors_to_update = [self.ecdf, self.roc, self.rsi, self._sma, self._price]
        # Indicators whose values are written to `factor_history`.
        self.factors_to_record = [self.ecdf, self.ecdf._roc, self.roc, self.rsi, self.sma_distance]
        for extra_indicators in (ecdf_indicators, roc_indicators, rsi_indicators):
            self.factors_to_update.extend(extra_indicators)
        for extra_indicators in (ecdf_indicators, roc_indicators, rsi_indicators):
            self.factors_to_record.extend(extra_indicators)

        self.factors = []

        # Explicit dtype: an empty Series without one has a version-dependent
        # default dtype in pandas.
        self._prices = pd.Series(dtype=float)

    @staticmethod
    def _factor_ready(factor):
        """Return a truthy value when `factor` reports readiness.

        Accepts either the indicator's `is_ready` or the `is_ready2` flag
        that the ECDF indicator sets.
        """
        return factor.is_ready or getattr(factor, 'is_ready2', False)

    def update(self, t, price, in_etf):
        """Record one observation.

        Args:
            t: Timestamp used as the row label.
            price: Price fed to every indicator.
            in_etf: Whether the asset is an ETF constituent at `t`.
        """
        self.factor_history.loc[t, 'in_etf'] = in_etf

        # Calculate the latest factor values given the new price.
        for factor in self.factors_to_update:
            factor.update(t, price)
        for factor in self.factors_to_record:
            if self._factor_ready(factor):
                self.factor_history.loc[t, factor.name] = factor.current.value

        # Keep the price; labels are derived from it on demand.
        self._prices.loc[t] = price

    @property
    def target_history(self):
        """Labels: 1 if the price 5 observations later is higher, else 0.

        The last 5 observations have no label yet and are excluded. The series
        is computed when accessed, rather than being rebuilt over the whole
        price history on every `update` call.
        """
        horizon = self._TARGET_HORIZON
        future_is_higher = self._prices.shift(-horizon) > self._prices
        return future_is_higher.iloc[:-horizon].astype(int)

    @property
    def is_ready(self):
        """True when every directly-updated indicator reports readiness."""
        return all(self._factor_ready(factor) for factor in self.factors_to_update)