Not that long ago I started out with QuantConnect, and I began working on modifying one of the example ML algorithms.
Currently the model is behaving very strangely and I don't know what to do. I suspect it may have something to do with the saving and loading of the model, since if I just create a fresh model each time I train, it behaves normally. Here's my code below, along with the code it's inspired by and based on:
# region imports
from AlgorithmImports import *
import gym
from stable_baselines3 import PPO
# endregion

class StableBaselinesExampleAlgorithm(QCAlgorithm):
    def Initialize(self):
        self.SetStartDate(2015, 7, 4)
        self.SetCash(100000)
        self.spy = self.AddEquity("SPY", Resolution.Daily).Symbol
        self.index = True
        training_length = 252*2
        self.training_data = RollingWindow[TradeBar](training_length)
        history = self.History[TradeBar](self.spy, training_length, Resolution.Daily)
        for trade_bar in history:
            self.training_data.Add(trade_bar)
        self.Train(self.my_training_method)
        self.Train(self.DateRules.MonthEnd(), self.TimeRules.At(8, 0), self.my_training_method)

    def get_observations_and_rewards(self, n_step=15):
        training_df = self.PandasConverter.GetDataFrame[TradeBar](list(self.training_data)[::-1])
        daily_pct_change = training_df['close'].pct_change().dropna()
        obs = []
        rewards = []
        for i in range(len(daily_pct_change)-n_step):
            obs.append(training_df.iloc[i:i+n_step].values)
            rewards.append(float(daily_pct_change.iloc[i+n_step]))
        obs = np.array(obs)
        rewards = np.array(rewards)
        return obs, rewards

    def my_training_method(self):
        obs, rewards = self.get_observations_and_rewards()
        self.env = TradingEnv(obs, rewards)
        self.model_key = "ppo_model"  # Key name for the model in the Object Store
        if self.ObjectStore.ContainsKey(self.model_key):
            file_name = self.ObjectStore.GetFilePath(self.model_key)
            self.model = PPO.load(file_name, env=self.env)  # Make sure to initialize `env` before this step
            self.Log("loaded")
        else:
            self.model = PPO("MlpPolicy", self.env, learning_rate=0.0005)
            self.Log("new")
        self.model.learn(total_timesteps=500)
        file_name = self.ObjectStore.GetFilePath(self.model_key)
        self.model.save(file_name)

    def OnData(self, data):
        if not self.model:
            return  # Model not yet initialized
        features, _ = self.get_observations_and_rewards()
        action, _ = self.model.predict(features[-15:], deterministic=True)
        _, _, _, _ = self.env.step(action)
        self.Log(str(action))
        if action == 0:
            self.Liquidate(self.spy)
        elif action == 1:
            # if self.spy.IsShort:
            #     self.Liquidate(self.spy)
            self.SetHoldings(self.spy, 1)
        elif action == 2:
            # if self.spy.IsLong:
            #     self.Liquidate(self.spy)
            self.SetHoldings(self.spy, -1)

class TradingEnv(gym.Env):
    FLAT = 0
    LONG = 1
    SHORT = 2

    def __init__(self, ohlcv, ret):
        super(TradingEnv, self).__init__()
        self.ohlcv = ohlcv
        self.ret = ret
        self.trading_cost = 0.01
        self.reward = 1
        # The number of steps the training has taken; starts at 15 since we use the previous 15 windows for observation
        self.current_step = 15
        # The last action
        self.last_action = 0
        # Define action and observation space
        # Using discrete actions, we have 3: LONG, SHORT and FLAT
        n_actions = 3
        self.action_space = gym.spaces.Discrete(n_actions)
        self.observation_space = gym.spaces.Box(low=-2, high=2, shape=(15, 15, 5), dtype=np.float64)

    def reset(self):
        # Reset the number of steps the training has taken
        self.current_step = 15
        # Reset the last action
        self.last_action = 0
        # Must return np.array type
        return self.ohlcv[self.current_step-15:self.current_step].astype(np.float32)

    def step(self, action):
        if action == self.LONG:
            self.reward *= 1 + self.ret[self.current_step] - (self.trading_cost if self.last_action != action else 0)
        elif action == self.SHORT:
            self.reward *= 1 - self.ret[self.current_step] - (self.trading_cost if self.last_action != action else 0)
        elif action == self.FLAT:
            self.reward *= 1 - (self.trading_cost if self.last_action != action else 0)
        else:
            raise ValueError("Received invalid action={} which is not part of the action space".format(action))
        self.last_action = action
        self.current_step += 1
        # Have we iterated over all data points?
        done = (self.current_step == self.ret.shape[0]-1)
        # Reward as return
        return self.ohlcv[self.current_step-15:self.current_step].astype(np.float32), self.reward, done, {}
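To isolate the save/load step from the rest of the algorithm, here's a minimal round-trip check that can run outside QuantConnect. It's only a sketch: it assumes the same gym/stable_baselines3 versions as above, uses CartPole-v1 as a stand-in for TradingEnv, and the file name is made up. If the deterministic prediction changes after reloading, the round trip itself is lossy; if it doesn't, the strange behaviour more likely comes from how training continues on the reloaded model (e.g. a new env and data window each month).

import gym
from stable_baselines3 import PPO

# Train briefly, save, reload, and compare deterministic predictions
env = gym.make("CartPole-v1")  # stand-in env; the issue shouldn't depend on TradingEnv
model = PPO("MlpPolicy", env, learning_rate=0.0005, seed=0)
model.learn(total_timesteps=500)

obs = env.reset()
action_before, _ = model.predict(obs, deterministic=True)

model.save("ppo_roundtrip_test")  # hypothetical file name
loaded = PPO.load("ppo_roundtrip_test", env=env)
action_after, _ = loaded.predict(obs, deterministic=True)

# If save/load is lossless, the two deterministic actions should match exactly
print("before:", action_before, "after:", action_after)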
And here's the original example the code is based on:
# region imports
from AlgorithmImports import *
import gym
from stable_baselines3 import DQN
# endregion

class StableBaselinesExampleAlgorithm(QCAlgorithm):
    def Initialize(self):
        self.SetStartDate(2022, 7, 4)
        self.SetCash(100000)
        self.spy = self.AddEquity("SPY", Resolution.Daily).Symbol
        training_length = 252*2
        self.training_data = RollingWindow[TradeBar](training_length)
        history = self.History[TradeBar](self.spy, training_length, Resolution.Daily)
        for trade_bar in history:
            self.training_data.Add(trade_bar)
        self.Train(self.my_training_method)
        self.Train(self.DateRules.Every(DayOfWeek.Sunday), self.TimeRules.At(8, 0), self.my_training_method)

    def get_observations_and_rewards(self, n_step=5):
        training_df = self.PandasConverter.GetDataFrame[TradeBar](list(self.training_data)[::-1])
        daily_pct_change = training_df['close'].pct_change().dropna()
        obs = []
        rewards = []
        for i in range(len(daily_pct_change)-n_step):
            obs.append(training_df.iloc[i:i+n_step].values)
            rewards.append(float(daily_pct_change.iloc[i+n_step]))
        obs = np.array(obs)
        rewards = np.array(rewards)
        return obs, rewards

    def my_training_method(self):
        obs, rewards = self.get_observations_and_rewards()
        self.env = TradingEnv(obs, rewards)
        self.model = DQN("MlpPolicy", self.env)
        self.model.learn(total_timesteps=500)

    def OnData(self, data):
        features, _ = self.get_observations_and_rewards()
        action, _ = self.model.predict(features[-5:], deterministic=True)
        _, _, _, _ = self.env.step(action)
        if action == 0:
            self.Liquidate(self.spy)
        elif action == 1:
            self.SetHoldings(self.spy, 1)
        elif action == 2:
            self.SetHoldings(self.spy, -1)

class TradingEnv(gym.Env):
    FLAT = 0
    LONG = 1
    SHORT = 2

    def __init__(self, ohlcv, ret):
        super(TradingEnv, self).__init__()
        self.ohlcv = ohlcv
        self.ret = ret
        self.trading_cost = 0.01
        self.reward = 1
        # The number of steps the training has taken; starts at 5 since we use the previous 5 data points for observation
        self.current_step = 5
        # The last action
        self.last_action = 0
        # Define action and observation space
        # Using discrete actions, we have 3: LONG, SHORT and FLAT
        n_actions = 3
        self.action_space = gym.spaces.Discrete(n_actions)
        # Each observation is a window of the 5 previous data points, each itself a (5, OHLCV) array
        self.observation_space = gym.spaces.Box(low=-2, high=2, shape=(5, 5, 5), dtype=np.float64)

    def reset(self):
        # Reset the number of steps the training has taken
        self.current_step = 5
        # Reset the last action
        self.last_action = 0
        # Must return np.array type
        return self.ohlcv[self.current_step-5:self.current_step].astype(np.float32)

    def step(self, action):
        if action == self.LONG:
            self.reward *= 1 + self.ret[self.current_step] - (self.trading_cost if self.last_action != action else 0)
        elif action == self.SHORT:
            self.reward *= 1 - self.ret[self.current_step] - (self.trading_cost if self.last_action != action else 0)
        elif action == self.FLAT:
            self.reward *= 1 - (self.trading_cost if self.last_action != action else 0)
        else:
            raise ValueError("Received invalid action={} which is not part of the action space".format(action))
        self.last_action = action
        self.current_step += 1
        # Have we iterated over all data points?
        done = (self.current_step == self.ret.shape[0]-1)
        # Reward as return
        return self.ohlcv[self.current_step-5:self.current_step].astype(np.float32), self.reward, done, {}
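One difference between reloading a model and rebuilding everything each time is the environment state that goes with it: in both versions above, TradingEnv.reset() restores current_step and last_action but never resets self.reward, so the compounded reward carries over from one episode to the next (and OnData keeps stepping the same env after training). A quick sketch with synthetic placeholder data, assuming the TradingEnv class from the example above is already defined:

import numpy as np

# Two episodes on the same TradingEnv instance with random actions
rng = np.random.default_rng(0)
ohlcv = rng.normal(size=(600, 5, 5))    # placeholder observation windows, not market data
ret = rng.normal(scale=0.01, size=600)  # placeholder daily returns

env = TradingEnv(ohlcv, ret)  # the TradingEnv class defined above
for episode in range(2):
    obs = env.reset()
    done = False
    while not done:
        obs, reward, done, _ = env.step(env.action_space.sample())
    # A fresh env would start each episode from reward = 1;
    # here episode 2 starts from wherever episode 1 ended
    print("episode", episode, "final compounded reward:", reward)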
Jason Shen
Here's a backtest:
Mia Alissi
To assist Jason after they've posted "Here's a backtest:", you might want to ask for more details about the backtest, or whether they're encountering any specific issue with it. Questions like these can help guide Jason to provide more information, making it easier to offer targeted assistance or advice.