Overall Statistics
from AlgorithmImports import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


class LazyPricesStrategy(QCAlgorithm):

    def initialize(self):
        """Configure the backtest, strategy parameters, state, universe and schedule.

        Sets a 2022-01-01 to 2024-01-01 window with 1,000,000 in cash and a
        400-day warm-up, adds SPY as the benchmark, registers the fundamental
        universe filter and schedules ``_rebalance`` at each month start.
        """
        # ── Backtest window and capital ──────────────────────────────────────
        self.set_start_date(2022, 1, 1)
        self.set_end_date(2024, 1, 1)
        self.set_cash(1_000_000)

        # NOTE(review): 400 days is presumably chosen so that at least one
        # earlier annual filing is seen before trading starts — confirm.
        self.set_warm_up(timedelta(days=400))

        # ── Benchmark ────────────────────────────────────────────────────────
        spy_security = self.add_equity("SPY", Resolution.DAILY)
        self.spy = spy_security.symbol
        self.set_benchmark(self.spy)

        # ── Parameters ───────────────────────────────────────────────────────
        self.n_long_short = 5            # names per side of the book
        self.max_signal_age = 400        # max age of a score, in days
        self.min_sim_threshold = 0.15    # scores below this are ignored
        self._max_subscriptions = 50     # cap on 10-K data subscriptions

        # ── State ────────────────────────────────────────────────────────────
        self.prev_filing_text = {}         # ticker -> text of latest filing seen
        self.sim_scores = {}               # ticker -> (similarity, time scored)
        self.sec_sym_to_ticker = {}        # 10-K dataset symbol -> ticker
        self._subscribed_tickers = set()   # tickers with a 10-K subscription

        # ── Universe ─────────────────────────────────────────────────────────
        self.universe_settings.resolution = Resolution.DAILY
        self.add_universe(self._select_universe)

        # ── Monthly rebalance ────────────────────────────────────────────────
        # Fires on the first trading day of each month, 30 minutes after the
        # SPY market open.
        on_month_start = self.date_rules.month_start(self.spy)
        after_open = self.time_rules.after_market_open(self.spy, 30)
        self.schedule.on(on_month_start, after_open, self._rebalance)

    # ─────────────────────────────────────────────────────────────────────────
    # Universe: top 50 most liquid US large-cap equities
    # ─────────────────────────────────────────────────────────────────────────
    def _select_universe(self, fundamental):
        """Pick the 50 highest dollar-volume large caps from the fundamental feed.

        A stock qualifies when it has fundamental data, dollar volume above
        1e8 and market cap above 5e9.

        Args:
            fundamental: iterable of fundamental objects exposing
                ``has_fundamental_data``, ``dollar_volume``, ``market_cap``
                and ``symbol``.

        Returns:
            List of at most 50 symbols, ordered by descending dollar volume.
        """
        candidates = []
        for stock in fundamental:
            if not stock.has_fundamental_data:
                continue
            # Written as "not (x > limit)" so the comparison semantics match
            # the plain ">" filter exactly.
            if not (stock.dollar_volume > 1e8):
                continue
            if not (stock.market_cap > 5e9):
                continue
            candidates.append(stock)

        # Most liquid first; the sort is stable, so ties keep feed order.
        candidates.sort(key=lambda stock: stock.dollar_volume, reverse=True)
        return [stock.symbol for stock in candidates[:50]]

    # ─────────────────────────────────────────────────────────────────────────
    # Subscribe / unsubscribe SEC 10-K data as stocks enter / leave universe
    # ─────────────────────────────────────────────────────────────────────────
    def on_securities_changed(self, changes):
        """Keep SEC 10-K subscriptions in sync with the equity universe.

        For every equity that leaves the universe, its 10-K dataset
        subscription is removed and its subscription slot is released. For
        every equity that enters, a 10-K subscription is added unless the
        ticker is already subscribed or ``_max_subscriptions`` is reached.

        Args:
            changes: object exposing ``added_securities`` and
                ``removed_securities``.
        """
        # Handle removals first so slots freed in this event can be used by
        # the additions below.
        for security in changes.removed_securities:
            if security.type != SecurityType.EQUITY:
                continue
            ticker = security.symbol.value

            # Release the slot. Without this the ticker stays "subscribed"
            # forever: it could never be re-subscribed on re-entry, and the
            # cap would fill permanently after 50 distinct tickers.
            self._subscribed_tickers.discard(ticker)

            # Snapshot the matching dataset symbols before mutating the dict.
            stale_syms = [
                s for s, t in self.sec_sym_to_ticker.items() if t == ticker
            ]
            for dataset_sym in stale_syms:
                self.remove_security(dataset_sym)
                del self.sec_sym_to_ticker[dataset_sym]

        for security in changes.added_securities:
            # Only equities get a 10-K subscription; other security types
            # are skipped.
            if security.type != SecurityType.EQUITY:
                continue
            equity_sym = security.symbol
            ticker = equity_sym.value
            if ticker in self._subscribed_tickers:
                continue
            if len(self._subscribed_tickers) >= self._max_subscriptions:
                continue
            sec_sym = self.add_data(SECReport10K, equity_sym, Resolution.DAILY).symbol
            self.sec_sym_to_ticker[sec_sym] = ticker
            self._subscribed_tickers.add(ticker)

    # ─────────────────────────────────────────────────────────────────────────
    # Process 10-K filings
    # ─────────────────────────────────────────────────────────────────────────
    def on_data(self, data):
        """Score each incoming 10-K against the previous filing for the same ticker.

        For every ``SECReport10K`` item in ``data`` the text is extracted. If
        an earlier filing text is stored for that ticker, the similarity of
        the two is saved in ``sim_scores`` together with the current
        algorithm time. The new text then replaces the stored one.

        Args:
            data: the current data slice.
        """
        for filing in data.get(SECReport10K).values():
            ticker = filing.symbol.underlying.value
            current_text = self._extract_text(filing)
            if not current_text:
                # Nothing usable in this filing; keep the stored text as is.
                continue

            if ticker in self.prev_filing_text:
                earlier_text = self.prev_filing_text[ticker]
                similarity = self._cosine_sim(earlier_text, current_text)
                if similarity is not None:
                    self.sim_scores[ticker] = (similarity, self.time)

            # The latest filing becomes the baseline for the next comparison.
            self.prev_filing_text[ticker] = current_text

    # ─────────────────────────────────────────────────────────────────────────
    # Extract Text
    # ─────────────────────────────────────────────────────────────────────────
    def _extract_text(self, report):
        """Join the document texts of a filing into one string.

        Documents with empty text or 50 characters or fewer are dropped. The
        rest are joined with single spaces.

        Args:
            report: object exposing ``report.documents``, each document having
                a ``text`` attribute.

        Returns:
            The joined text when it is longer than 200 characters, otherwise
            ``None``. Any exception is logged through ``self.debug`` and also
            yields ``None``.
        """
        try:
            chunks = []
            for doc in report.report.documents:
                body = doc.text
                if body and len(body) > 50:
                    chunks.append(body)
            text = " ".join(chunks)
            if len(text) > 200:
                return text
            return None
        except Exception as e:
            self.debug(f"Extract error: {e}")
            return None

    # ─────────────────────────────────────────────────────────────────────────
    # TF-IDF cosine similarity
    # ─────────────────────────────────────────────────────────────────────────
    def _cosine_sim(self, text_a, text_b):
        """Return the TF-IDF cosine similarity of two texts.

        A fresh ``TfidfVectorizer`` (5,000 features max, English stop words,
        sublinear term frequency) is fitted on just the two texts.

        Args:
            text_a: first document text.
            text_b: second document text.

        Returns:
            The similarity as a ``float``, or ``None`` if vectorising or
            comparing raised; the error is logged through ``self.debug``.
        """
        try:
            vectorizer = TfidfVectorizer(
                max_features=5_000,
                stop_words="english",
                sublinear_tf=True,
            )
            tfidf = vectorizer.fit_transform([text_a, text_b])
            # Row slices keep each operand as a 1-row matrix.
            row_a, row_b = tfidf[0:1], tfidf[1:2]
            similarity = cosine_similarity(row_a, row_b)
            return float(similarity[0][0])
        except Exception as e:
            self.debug(f"Similarity error: {e}")
            return None

    # ─────────────────────────────────────────────────────────────────────────
    # Monthly rebalance: long high-similarity, short low-similarity
    # ─────────────────────────────────────────────────────────────────────────
    def _rebalance(self):
        """Rebuild the long/short book from the stored similarity scores.

        Does nothing while warming up. Otherwise it collects one score per
        active equity, ignoring scores below ``min_sim_threshold`` or older
        than ``max_signal_age`` days. With at least ``2 * n_long_short``
        signals it shorts the ``n_long_short`` lowest scores, goes long the
        ``n_long_short`` highest, and liquidates every other invested
        holding. Each position gets weight ``0.5 / n_long_short``.
        """
        if self.is_warming_up:
            return

        signals = {}
        used_tickers = set()

        for symbol, security in self.active_securities.items():
            if security.type != SecurityType.EQUITY:
                continue

            ticker = symbol.value
            if ticker in used_tickers:
                continue

            record = self.sim_scores.get(ticker)
            if record is None:
                continue
            score, scored_at = record

            if score < self.min_sim_threshold:
                self.debug(f"{self.time.date()} | {ticker} | score {score:.4f} below threshold, skipping")
                continue

            # Drop scores that are too old to act on.
            age_days = (self.time - scored_at).days
            if age_days > self.max_signal_age:
                continue

            signals[symbol] = score
            used_tickers.add(ticker)

        # Need enough names to fill both sides without overlap.
        min_needed = 2 * self.n_long_short
        if len(signals) < min_needed:
            self.debug(f"{self.time.date()} | {len(signals)} signals < {min_needed}. Skipping.")
            return

        # Ascending by score: front of the list is least similar.
        by_score = sorted(signals.items(), key=lambda pair: pair[1])
        ordered = [sym for sym, _ in by_score]
        short_syms = ordered[:self.n_long_short]
        long_syms = ordered[-self.n_long_short:]

        # Close anything currently held that is not in the new book.
        keep = set(long_syms) | set(short_syms)
        for sym, holding in self.portfolio.items():
            if not holding.invested or sym in keep:
                continue
            self.liquidate(sym)

        # Equal weights: 50% gross long and 50% gross short in total.
        w = 0.5 / self.n_long_short
        targets = []
        for s in long_syms:
            targets.append(PortfolioTarget(s, w))
        for s in short_syms:
            targets.append(PortfolioTarget(s, -w))
        self.set_holdings(targets)

        self.debug(
            f"{self.time.date()} | Signals={len(signals)} | "
            f"Long={[s.value for s in long_syms]} | "
            f"Short={[s.value for s in short_syms]}"
        )
        self.plot("Strategy", "Signal Count", len(signals))

    def on_end_of_algorithm(self):
        """Log the final portfolio value and every stored similarity score.

        Scores are listed in ascending order of similarity, each with the
        date it was recorded.
        """
        self.debug(f"Final Value    : ${self.portfolio.total_portfolio_value:,.2f}")
        self.debug(f"Scored tickers : {len(self.sim_scores)}")
        self.debug(f"Stored filings : {len(self.prev_filing_text)}")

        def by_similarity(item):
            # item is (ticker, (score, time)); order by the score.
            return item[1][0]

        ranked = sorted(self.sim_scores.items(), key=by_similarity)
        for ticker, (score, date) in ranked:
            self.debug(f"  {ticker}: sim={score:.4f} (filed {date.date()})")