Help with array length in historical dataframe

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from datetime import datetime
from datetime import timedelta
import pandas as pd
import numpy as np

class TransdimensionalTachyonCompensator(QCAlgorithm):
    month = 0
    def Initialize(self):
        self.SetStartDate(2000, 1, 1)  # Set Start Date
        # self.SetEndDate(2020, 6, 1)
        self.SetCash(100000)  # Set Strategy Cash
        self.trading_symbols = []
        self.UniverseSettings.Resolution = Resolution.Minute
        self.AddUniverse(self.CoarseSelectionFunction) 
        # self.SetSecurityInitializer(lambda x: x.SetMarketPrice(self.GetLastKnownPrice(x)))
        self.lookback_period = 20 
        self.spy = self.AddEquity("SPY").Symbol
        
        # self.Train(self.DateRules.MonthStart(), self.TimeRules.AfterMarketOpen(self.spy, 0), self.train)
        self.Schedule.On(self.DateRules.MonthStart(), self.TimeRules.AfterMarketOpen(self.spy, 15), self.train)
        # self.Schedule.On(self.DateRules.EveryDay(), self.TimeRules.AfterMarketOpen(self.spy, 30), self.invest)

    def avg_true_range(self, df): 
      ind = range(0,len(df))
      indexlist = list(ind)
      df.index = indexlist
    
      for index, row in df.iterrows():
        if index != 0:
          tr1 = row["high"] - row["low"]
          tr2 = abs(row["high"] - df.iloc[index-1]["close"])
          tr3 = abs(row["low"] - df.iloc[index-1]["close"])
    
          true_range = max(tr1, tr2, tr3)
          df.set_value(index,"True Range", true_range)
    
      df["Avg TR"] = df["True Range"].rolling(min_periods=14, window=14, center=False).mean()
      
      return df        

    def CoarseSelectionFunction(self, coarse):
        if self.Time.month == self.month:
            return Universe.Unchanged
        self.month = self.Time.month
        sortedByDollarVolume = sorted(coarse, key=lambda x: x.DollarVolume, reverse=True)
        self.trading_symbols = [ x.Symbol for x in sortedByDollarVolume if x.HasFundamentalData ][:20]
        return self.trading_symbols
        
    def train(self):
        if not self.trading_symbols: return
    
        for symbol in self.trading_symbols:
            self.AddEquity(symbol, Resolution.Daily)
        
        today = self.Time
        
        price_history = self.History(self.trading_symbols, self.lookback_period, Resolution.Daily)
        
        symbol1 = [self.trading_symbols[0] for i in range(self.lookback_period)]
        symbol2 = [self.trading_symbols[1] for i in range(self.lookback_period)]
        symbol3 = [self.trading_symbols[2] for i in range(self.lookback_period)]
        symbol4 = [self.trading_symbols[3] for i in range(self.lookback_period)]
        symbol5 = [self.trading_symbols[4] for i in range(self.lookback_period)]
        symbol6 = [self.trading_symbols[5] for i in range(self.lookback_period)]
        symbol7 = [self.trading_symbols[6] for i in range(self.lookback_period)]
        symbol8 = [self.trading_symbols[7] for i in range(self.lookback_period)]
        symbol9 = [self.trading_symbols[8] for i in range(self.lookback_period)]
        symbol10 = [self.trading_symbols[9] for i in range(self.lookback_period)]
        symbol11 = [self.trading_symbols[10] for i in range(self.lookback_period)]
        symbol12 = [self.trading_symbols[11] for i in range(self.lookback_period)]
        symbol13 = [self.trading_symbols[12] for i in range(self.lookback_period)]
        symbol14 = [self.trading_symbols[13] for i in range(self.lookback_period)]
        symbol15 = [self.trading_symbols[14] for i in range(self.lookback_period)]
        symbol16 = [self.trading_symbols[15] for i in range(self.lookback_period)]
        symbol17 = [self.trading_symbols[16] for i in range(self.lookback_period)]
        symbol18 = [self.trading_symbols[17] for i in range(self.lookback_period)]
        symbol19 = [self.trading_symbols[18] for i in range(self.lookback_period)]
        symbol20 = [self.trading_symbols[19] for i in range(self.lookback_period)]
        
        close_list = []
        open_list = []
        high_list = []
        low_list = []
        volume_list = []
        
        self.chandelier_long = []
        self.chandelier_short = []
        
        date_list = [ str(price_history.index[i][1]).split(" ")[0] for i in range(price_history.shape[0]) ]
        
        for i in range(price_history.shape[0]):
            close_list.append(price_history.close.iloc[i])
            open_list.append(price_history.open.iloc[i])
            high_list.append(price_history.high.iloc[i])
            low_list.append(price_history.low.iloc[i])
            volume_list.append(price_history.volume.iloc[i])
        
        d = {'symbol': symbol1 + symbol2 + symbol3 + symbol4 + symbol5 + symbol6 + symbol7 + symbol8 + symbol9 \
        + symbol10 + symbol11 + symbol12 + symbol13 + symbol14 + symbol15 + symbol16 + symbol17 + symbol18 + \
        symbol19 + symbol20,
            'datetime': date_list,
            'close': close_list,
            'high': high_list,
            'low': low_list,
            'open': open_list,
            'volume': volume_list
        }
        self.price_data = pd.DataFrame(data=d)
        
        # sort the values by symbol and then date
        # self.price_data.sort_values(by = ['datetime'], inplace = True)
        
        # calculate the change in price
        self.price_data['change_in_price'] = self.price_data['close'].diff()

        # identify rows where the symbol changes
        mask = self.price_data['symbol'] != self.price_data['symbol'].shift(1)
        
        # For those rows, let's make the value null
        self.price_data['change_in_price'] = np.where(mask == True, np.nan, self.price_data['change_in_price'])
        
        # print the rows that have a null value, should have 20
        self.price_data[self.price_data.isna().any(axis = 1)]
        
        # Calculate the 14 day RSI
        n = 14
        
        # First make a copy of the data frame twice
        up_df, down_df = self.price_data[['symbol','change_in_price']].copy(), self.price_data[['symbol','change_in_price']].copy()
        
        # For up days, if the change is less than 0 set to 0.
        # up_df.loc['change_in_price'] = up_df.loc[(up_df['change_in_price'] < 0), 'change_in_price'] = 0
        
        # For down days, if the change is greater than 0 set to 0.
        # down_df.loc['change_in_price'] = down_df.loc[(down_df['change_in_price'] > 0), 'change_in_price'] = 0
        
        up_df['change_in_price'] = up_df['change_in_price'].apply(lambda x: max(x, 0))
        
        down_df['change_in_price'] = down_df['change_in_price'].apply(lambda x: min(0, x))

        # We need change in price to be absolute.
        down_df['change_in_price'] = down_df['change_in_price'].abs()
        
        # self.price_data["RSI"] = self.RSI("SPY", 14, MovingAverageType.Simple)

        # Calculate the EWMA (Exponential Weighted Moving Average), meaning older values are given less weight compared to newer values.
        ewma_up = up_df.groupby('symbol')['change_in_price'].transform(lambda x: x.ewm(span = n).mean())
        ewma_down = down_df.groupby('symbol')['change_in_price'].transform(lambda x: x.ewm(span = n).mean())
        
        # Calculate the Relative Strength
        relative_strength = ewma_up / ewma_down
        
        # Calculate the Relative Strength Index
        relative_strength_index = 100.0 - (100.0 / (1.0 + relative_strength))
        
        # Add the info to the data frame.
        self.price_data['down_days'] = down_df['change_in_price']
        self.price_data['up_days'] = up_df['change_in_price']
        self.price_data['RSI'] = relative_strength_index

        # Calculate the Stochastic Oscillator
        # Make a copy of the high and low column.
        low_14, high_14 = self.price_data[['symbol','low']].copy(), self.price_data[['symbol','high']].copy()
        
        # Group by symbol, then apply the rolling function and grab the Min and Max.
        low_14 = low_14.groupby('symbol')['low'].transform(lambda x: x.rolling(window = n).min())
        high_14 = high_14.groupby('symbol')['high'].transform(lambda x: x.rolling(window = n).max())
        
        # Calculate the Stochastic Oscillator.
        k_percent = 100 * ((self.price_data['close'] - low_14) / (high_14 - low_14))
        
        # Add the info to the data frame.
        self.price_data['low_14'] = low_14
        self.price_data['high_14'] = high_14
        self.price_data['k_percent'] = k_percent
        
        # Make a copy of the high and low column.
        low_14, high_14 = self.price_data[['symbol','low']].copy(), self.price_data[['symbol','high']].copy()
        
        # Group by symbol, then apply the rolling function and grab the Min and Max.
        low_14 = low_14.groupby('symbol')['low'].transform(lambda x: x.rolling(window = n).min())
        high_14 = high_14.groupby('symbol')['high'].transform(lambda x: x.rolling(window = n).max())
        
        # Calculate William %R indicator.
        r_percent = ((high_14 - self.price_data['close']) / (high_14 - low_14)) * - 100
        
        # Add the info to the data frame.
        self.price_data['r_percent'] = r_percent
        
        # Calculate the MACD
        ema_26 = self.price_data.groupby('symbol')['close'].transform(lambda x: x.ewm(span = 26).mean())
        ema_12 = self.price_data.groupby('symbol')['close'].transform(lambda x: x.ewm(span = 12).mean())
        macd = ema_12 - ema_26
        
        # Calculate the EMA
        ema_9_macd = macd.ewm(span = 9).mean()
        
        # Store the data in the data frame.
        self.price_data['MACD'] = macd
        self.price_data['MACD_EMA'] = ema_9_macd
        
        # Calculate the Price Rate of Change
        roc_n = 9
        
        # Calculate the Rate of Change in the Price, and store it in the Data Frame.
        self.price_data['Price_Rate_Of_Change'] = self.price_data.groupby('symbol')['close'].transform(lambda x: x.pct_change(periods = roc_n))
        
        # apply the function to each group
        # obv_groups = self.price_data.groupby('symbol').apply(self.obv)

        # self.Debug(f"show obv groups length")
        # self.Debug(len(obv_groups))

        # obv_groups going in one by one
        # self.Debug(f"show obv_groups")
        # self.Debug(len(obv_groups)) 

        # add to the data frame, but drop the old index, before adding it.
        # self.price_data['On Balance Volume'] = obv_groups.reset_index(level=0, drop=True)

        # self.price_data["On Balance Volume"] = obv_groups
        
        # self.Debug(f"show obv groups length")
        
        # self.Debug(len(obv_groups))

        # self.Debug(f"show on balance volume")

        # self.Debug(self.price_data.head())
        
        # Create a column we wish to predict
        '''
            In this case, let's create an output column that will be 1 if the closing price at time 't' is greater than 't-1' and 0 otherwise.
            In other words, if the today's closing price is greater than yesterday's closing price it would be 1.
        '''
        
        # Group by the `Symbol` column, then grab the `Close` column.
        close_groups = self.price_data.groupby('symbol')['close']
        
        # Apply the lambda function which will return -1.0 for down, 1.0 for up and 0.0 for no change.
        close_groups = close_groups.transform(lambda x : np.sign(x.diff()))
        
        # add the data to the main dataframe.
        self.price_data['Prediction'] = close_groups
        
        # for simplicity in later sections I'm going to make a change to our prediction column. To keep this as a binary classifier I'll change flat days and consider them up days.
        self.price_data.loc[self.price_data['Prediction'] == 0.0] = 1.0
        
        self.price_data = self.avg_true_range(self.price_data)

        # Any row that has a `NaN` value will be dropped.
        self.price_data = self.price_data.dropna()

        self.price_data.reset_index(drop = True)
        
        for i in range(len(self.price_data)):
            self.chandelier_long.append(self.price_data["high"][i:i+22].max() - (self.price_data["Avg TR"][i:i+22].mean() * 3))
            self.chandelier_short.append(self.price_data["low"][i:i+22].max() + (self.price_data["Avg TR"][i:i+22].mean() * 3))

        self.price_data["chandelier long"] = self.chandelier_long
        self.price_data["chandelier short"] = self.chandelier_short
        
        # Grab our X & Y Columns.
        # X_Cols = self.price_data[['RSI','k_percent','r_percent','Price_Rate_Of_Change','MACD','On Balance Volume']]
        X_Cols = self.price_data[['RSI','k_percent','r_percent','Price_Rate_Of_Change','MACD']]

        Y_Cols = self.price_data['Prediction']
            
        # Split X and y into X_
        X_train, X_test, y_train, y_test = train_test_split(X_Cols, Y_Cols, random_state = 0)
        
        # Create a Random Forest Classifier
        rand_frst_clf = RandomForestClassifier(n_estimators = 100, oob_score = True, criterion = "gini", random_state = 0)
        
        # Fit the data to the model
        rand_frst_clf.fit(X_train, y_train)
        
        # Make predictions
        y_pred = rand_frst_clf.predict(X_test)
    
        # sort the values by symbol and then date
        # self.price_data.sort_values(by = ['symbol','datetime'], inplace = True)
        
        # self.Debug(f"Correct Prediction (%): ")
        # self.Debug(accuracy_score(y_test, rand_frst_clf.predict(X_test), normalize = True) * 100.0)
        
        # self.price_data['Prediction'] = self.price_data['Prediction'].shift(periods=1)
        
        self.price_data = self.price_data.dropna()

        self.test_df = self.price_data[ ["symbol", "close", "datetime", "Prediction", "chandelier long", "chandelier short"]]

    # def invest (self):
        today = self.Time
        
        date = str(today).split()[0]

        # self.Debug(self.test_df["datetime"])
        
        select_indices = list(np.where(self.test_df["datetime"] == date)[0])
    
        for i in select_indices:
            # self.Debug(self.test_df["symbol"].iloc[i])
            if self.test_df["Prediction"].iloc[i] == 1:
                self.SetHoldings( self.test_df["symbol"].iloc[i] , 1/len(self.trading_symbols))
                
                if self.test_df["close"].iloc[i] < self.test_df["chandelier long"].iloc[i]:
                    self.Liquidate(self.test_df["symbol"].iloc[i])
            
            elif self.test_df["Prediction"].iloc[i] == -1:
                self.SetHoldings(self.test_df["symbol"].iloc[i] , -1.0/len(self.trading_symbols))
            
                if self.test_df["close"].iloc[i] < self.test_df["chandelier short"].iloc[i]:
                    self.Liquidate(self.test_df["symbol"].iloc[i])
        
        # self.Debug(self.test_df)

Hi guys, I'm currently facing the following error:

"Runtime Error: In Scheduled Event 'MonthStart: SPY: 15 min after MarketOpen', ValueError : arrays must all be same length
at train in main.py:line 111"

The thing is, the code works fine for the first couple of iterations and only breaks after a couple of months.

Any help will be greatly appreciated, thank you!

The material on this website is provided for informational purposes only and does not constitute an offer to sell, a solicitation to buy, or a recommendation or endorsement for any security or strategy, nor does it constitute an offer to provide investment advisory services by QuantConnect. In addition, the material offers no opinion with respect to the suitability of any security or specific investment. QuantConnect makes no guarantees as to the accuracy or completeness of the views expressed in the website. The views are subject to change, and may have become unreliable for various reasons, including changes in market conditions or economic circumstances. All investments involve risk, including loss of principal. You should consult with an investment professional before making any investment decisions.

Hi Wei,
To address this, we can make sure all of the lists are of the same length before converting the dict to a DataFrame.

Furthermore, please do not mix AddEquity with Universe Selection, as the Universe Selection already adds the securities.
I've shown the changes in the attached backtest.
Best,
Shile Wen

Shile Wen

63.5k ,

Wei li INVESTOR

Update Backtest

Notebook

person upvoted this people upvoted this

To unlock posting to the community forums please complete at least 30% of Boot Camp.
You can continue your Boot Camp training progress from the terminal. We hope to see you in the community soon!

Radically Open-Source Algorithmic Trading Engine

Join Our Discord Channel

Draft Discussions

Bookmarked Discussions

SEARCH DISCUSSIONS

TOP 5 Research Publications

447,400 Quants.

VOTE FOR UPCOMING FEATURES

Help with array length in historical dataframe

Allocate to this Strategy

Organization

Team

Clone Strategy

Previous Ranking

IN THIS RESEARCH

PARTICIPANTS

Discussion Awards

Actions

Join QuantConnect for Free

SIGN IN

Radically Open-Source Algorithmic Trading Engine

Join Our Discord Channel

Draft Discussions

Bookmarked Discussions

SEARCH DISCUSSIONS

TOP 5 Research Publications

447,400 Quants.

VOTE FOR UPCOMING FEATURES

Help with array length in historical dataframe

Allocate to this Strategy

Organization

Team

Clone Strategy

Previous Ranking

IN THIS RESEARCH

PARTICIPANTS

Discussion Awards

SHARE RESEARCH

SHARE DISCUSSION

SHARE ARTICLE

SHARE

Actions

Join QuantConnect for Free