Having started out with QuantConnect not that long ago, I've been working on editing one of the example ML algorithms.

Currently, the model is behaving very strangely and I don't know what to do. I suspect it may have something to do with the saving and loading of the model, since if I just create a fresh model each time I train instead of loading the saved one, it behaves normally. Here's my code below, followed by the example code it is inspired by and based on:

# region imports
from AlgorithmImports import *
import gym
from stable_baselines3 import PPO
# endregion

class StableBaselinesExampleAlgorithm(QCAlgorithm):
    
    def Initialize(self):
        self.SetStartDate(2015, 7, 4)
        self.SetCash(100000)
        self.spy = self.AddEquity("SPY", Resolution.Daily).Symbol
        self.index = True
        training_length = 252*2
        self.training_data = RollingWindow[TradeBar](training_length)
        history = self.History[TradeBar](self.spy, training_length, Resolution.Daily)
        for trade_bar in history:
            self.training_data.Add(trade_bar)
        
        self.Train(self.my_training_method)
        self.Train(self.DateRules.MonthEnd(), self.TimeRules.At(8,0), self.my_training_method)

    def get_observations_and_rewards(self, n_step=15):
        training_df = self.PandasConverter.GetDataFrame[TradeBar](list(self.training_data)[::-1])
        daily_pct_change = training_df['close'].pct_change().dropna()

        obs = []
        rewards = []
        for i in range(len(daily_pct_change)-n_step):
            obs.append(training_df.iloc[i:i+n_step].values)
            rewards.append(float(daily_pct_change.iloc[i+n_step]))
        obs = np.array(obs)
        rewards = np.array(rewards)

        return obs, rewards

    def my_training_method(self):
        obs, rewards = self.get_observations_and_rewards()
        
        self.env = TradingEnv(obs, rewards)

        self.model_key = "ppo_model"  # Key name for the model in Object Store
        if self.ObjectStore.ContainsKey(self.model_key):
            file_name = self.ObjectStore.GetFilePath(self.model_key)
            self.model = PPO.load(file_name, env=self.env)  # Make sure to initialize `env` before this step
            self.Log("loaded")
        else:
            self.model = PPO("MlpPolicy", self.env, learning_rate= 0.0005)
            self.Log("new")
        
        self.model.learn(total_timesteps=500)

        file_name = self.ObjectStore.GetFilePath(self.model_key)
        self.model.save(file_name)


    def OnData(self, data):
        if not hasattr(self, "model"):
            return  # Model not yet initialized
        features, _ = self.get_observations_and_rewards()
        action, _ = self.model.predict(features[-15:], deterministic=True)
        _, _, _, _ = self.env.step(action)
        self.Log(action)
        
        if action == 0:
            self.Liquidate(self.spy)
        elif action == 1:
            # if self.spy.IsShort:
            #     self.Liquidate(self.spy)
            self.SetHoldings(self.spy, 1)

        elif action == 2:
            # if self.spy.IsLong:
            #     self.Liquidate(self.spy)
            self.SetHoldings(self.spy, -1)
            
class TradingEnv(gym.Env):
    FLAT = 0
    LONG = 1
    SHORT = 2

    def __init__(self, ohlcv, ret):
        super(TradingEnv, self).__init__()
        
        self.ohlcv = ohlcv
        self.ret = ret
        self.trading_cost = 0.01
        self.reward = 1
        
        # The number of steps the training has taken; starts at 15 since we use the previous 15 data points for each observation
        self.current_step = 15
        # The last action
        self.last_action = 0

        # Define action and observation space
        # Using discrete actions, we have 3: LONG, SHORT and FLAT.
        n_actions = 3
        self.action_space = gym.spaces.Discrete(n_actions)
        # Each observation stacks 15 windows of the previous 15 OHLCV bars, hence shape (15, 15, 5)
        self.observation_space = gym.spaces.Box(low=-2, high=2, shape=(15, 15, 5), dtype=np.float64)

    def reset(self):
        # Reset the number of steps the training has taken
        self.current_step = 15
        # Reset the last action
        self.last_action = 0
        # must return np.array type
        return self.ohlcv[self.current_step-15:self.current_step].astype(np.float32)

    def step(self, action):
        if action == self.LONG:
            self.reward *= 1 + self.ret[self.current_step] - (self.trading_cost if self.last_action != action else 0)
        elif action == self.SHORT:
            self.reward *= 1 + -1 * self.ret[self.current_step] - (self.trading_cost if self.last_action != action else 0)
        elif action == self.FLAT:
            self.reward *= 1 - (self.trading_cost if self.last_action != action else 0)
        else:
            raise ValueError("Received invalid action={} which is not part of the action space".format(action))
            
        self.last_action = action
        self.current_step += 1

        # Have we iterated over all data points?
        done = (self.current_step == self.ret.shape[0]-1)

        # Reward as return
        return self.ohlcv[self.current_step-15:self.current_step].astype(np.float32), self.reward, done, {}
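
To sanity-check whether saving and loading by itself changes the policy, here is a minimal round trip outside of QuantConnect (just a rough sketch; it assumes stable-baselines3 1.x with the old gym API, uses CartPole as a stand-in environment, and the file path is only an example):

import gym
from stable_baselines3 import PPO

env = gym.make("CartPole-v1")
model = PPO("MlpPolicy", env, learning_rate=0.0005)
model.learn(total_timesteps=500)

obs = env.reset()
action_before, _ = model.predict(obs, deterministic=True)

# Stable Baselines appends ".zip" when saving to a path without an extension
model.save("/tmp/ppo_roundtrip")
reloaded = PPO.load("/tmp/ppo_roundtrip", env=env)
action_after, _ = reloaded.predict(obs, deterministic=True)

# If the round trip is clean, the two deterministic actions should match
print(action_before, action_after)

If the actions match, the oddity is probably not PPO's serialization itself but something around it, e.g. how the Object Store file path or the environment is recreated between training sessions.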

 

And here's the original example that my code is based on:

# region imports
from AlgorithmImports import *
import gym
from stable_baselines3 import DQN
# endregion

class StableBaselinesExampleAlgorithm(QCAlgorithm):
    
    def Initialize(self):
        self.SetStartDate(2022, 7, 4)
        self.SetCash(100000)
        self.spy = self.AddEquity("SPY", Resolution.Daily).Symbol

        training_length = 252*2
        self.training_data = RollingWindow[TradeBar](training_length)
        history = self.History[TradeBar](self.spy, training_length, Resolution.Daily)
        for trade_bar in history:
            self.training_data.Add(trade_bar)

        self.Train(self.my_training_method)
        self.Train(self.DateRules.Every(DayOfWeek.Sunday), self.TimeRules.At(8,0), self.my_training_method)
        
    def get_observations_and_rewards(self, n_step=5):
        training_df = self.PandasConverter.GetDataFrame[TradeBar](list(self.training_data)[::-1])
        daily_pct_change = training_df['close'].pct_change().dropna()

        obs = []
        rewards = []
        for i in range(len(daily_pct_change)-n_step):
            obs.append(training_df.iloc[i:i+n_step].values)
            rewards.append(float(daily_pct_change.iloc[i+n_step]))
        obs = np.array(obs)
        rewards = np.array(rewards)

        return obs, rewards

    def my_training_method(self):
        obs, rewards = self.get_observations_and_rewards()
        self.env = TradingEnv(obs, rewards)
        self.model = DQN("MlpPolicy", self.env)
        self.model.learn(total_timesteps=500)

    def OnData(self, data):
        features, _ = self.get_observations_and_rewards()
        action, _ = self.model.predict(features[-5:], deterministic=True)
        _, _, _, _ = self.env.step(action)

        if action == 0:
            self.Liquidate(self.spy)
        elif action == 1:
            self.SetHoldings(self.spy, 1)
        elif action == 2:
            self.SetHoldings(self.spy, -1)
            
class TradingEnv(gym.Env):
    FLAT = 0
    LONG = 1
    SHORT = 2

    def __init__(self, ohlcv, ret):
        super(TradingEnv, self).__init__()
        
        self.ohlcv = ohlcv
        self.ret = ret
        self.trading_cost = 0.01
        self.reward = 1
        
        # The number of steps the training has taken; starts at 5 since we're using the previous 5 data points for observation.
        self.current_step = 5
        # The last action
        self.last_action = 0

        # Define action and observation space
        # Using discrete actions, we have 3: LONG, SHORT and FLAT.
        n_actions = 3
        self.action_space = gym.spaces.Discrete(n_actions)
        # The observation covers the previous data points, with shape (5 windows, 5 previous data points, OHLCV)
        self.observation_space = gym.spaces.Box(low=-2, high=2, shape=(5, 5, 5), dtype=np.float64)

    def reset(self):
        # Reset the number of steps the training has taken
        self.current_step = 5
        # Reset the last action
        self.last_action = 0
        # must return np.array type
        return self.ohlcv[self.current_step-5:self.current_step].astype(np.float32)

    def step(self, action):
        if action == self.LONG:
            self.reward *= 1 + self.ret[self.current_step] - (self.trading_cost if self.last_action != action else 0)
        elif action == self.SHORT:
            self.reward *= 1 + -1 * self.ret[self.current_step] - (self.trading_cost if self.last_action != action else 0)
        elif action == self.FLAT:
            self.reward *= 1 - (self.trading_cost if self.last_action != action else 0)
        else:
            raise ValueError("Received invalid action={} which is not part of the action space".format(action))
            
        self.last_action = action
        self.current_step += 1

        # Have we iterated over all data points?
        done = (self.current_step == self.ret.shape[0]-1)

        # Reward as return
        return self.ohlcv[self.current_step-5:self.current_step].astype(np.float32), self.reward, done, {}