# Source: templates/python/sec-filing-research/main.py (browserbase/templates, dev branch)
# Stagehand + Browserbase: SEC Filing Research - See README.md for full documentation

import asyncio
import json
import os

from dotenv import load_dotenv
from pydantic import BaseModel, Field
from stagehand import AsyncStagehand


class CompanyInfo(BaseModel):
    """Schema for company information extraction."""

    # main() sends CompanyInfo.model_json_schema() to the extract call, so the
    # Field descriptions below act as instructions to the extractor; treat
    # their wording as behavior, not as documentation.
    # Field names are camelCase on purpose (hence the N815 suppressions):
    # main() reads the extracted dict by these exact keys.

    # Name of the company; main() only accepts an extraction result when this
    # key is present and non-empty.
    companyName: str = Field(description="Official company name")  # noqa: N815
    # Kept as a string rather than an int; the sample CIK in this file
    # ("0000320193") has leading zeros.
    cik: str = Field(description="Central Index Key (CIK) number")


class Filing(BaseModel):
    """Schema for a single SEC filing."""

    # One row of the filings table. The model is only used to generate the
    # JSON schema passed to the extract call in main(); main() then reads the
    # extracted rows as plain dicts using these exact (camelCase) keys.

    # Form type of the filing.
    type: str = Field(description="Filing type (e.g., 10-K, 10-Q, 8-K)")
    # The date format is requested through the description only; nothing in
    # this file validates or parses it.
    date: str = Field(description="Filing date in YYYY-MM-DD format")
    # main() truncates this to 80 characters for console output only.
    description: str = Field(description="Full description of the filing")
    accessionNumber: str = Field(description="SEC accession number")  # noqa: N815
    # The only optional field: may be absent or null in the extracted data.
    fileNumber: str | None = Field(default=None, description="File/Film number")  # noqa: N815


class FilingsList(BaseModel):
    """Schema for extracting a list of SEC filings."""

    # Wrapper object around the list of Filing rows. main() reads the
    # "filings" key from the extracted result. Because this model nests
    # Filing, main() runs its JSON schema through dereference_schema() before
    # sending it.
    filings: list[Filing] = Field(description="List of SEC filings")


def dereference_schema(schema: dict) -> dict:
    """Inline all $ref references in a JSON schema for Gemini compatibility.

    Every ``{"$ref": ".../Name"}`` node is replaced by the definition stored
    under ``schema["$defs"]["Name"]`` (only the last path segment of the
    reference is used for the lookup). The top-level ``$defs`` table is left
    out of the result.

    Args:
        schema: A JSON-schema dict, e.g. the output of
            ``Model.model_json_schema()``. It is not modified.

    Returns:
        A new dict with all references inlined. Keywords written next to a
        ``$ref`` (such as ``description``) are kept and take precedence over
        the same keyword in the referenced definition. A reference to an
        unknown definition resolves to an empty schema.

    Raises:
        ValueError: If a definition refers to itself, directly or indirectly;
            such a schema cannot be inlined.
    """
    # Read the definitions instead of popping them, so the caller's dict is
    # left untouched.
    defs = schema.get("$defs") or {}

    def resolve_refs(obj, active=()):
        # `active` holds the definition names currently being expanded and is
        # used to detect reference cycles.
        if isinstance(obj, dict):
            ref = obj.get("$ref")
            # Only a string value is a reference. A property that happens to
            # be named "$ref" maps to a sub-schema dict and is walked normally.
            if isinstance(ref, str):
                ref_name = ref.split("/")[-1]
                if ref_name in active:
                    raise ValueError(
                        f"Cannot inline recursive schema reference: {ref}"
                    )
                target = resolve_refs(defs.get(ref_name, {}), (*active, ref_name))
                siblings = {
                    key: resolve_refs(value, active)
                    for key, value in obj.items()
                    if key != "$ref"
                }
                if siblings and isinstance(target, dict):
                    # Keywords at the reference site override the definition.
                    return {**target, **siblings}
                return target
            return {key: resolve_refs(value, active) for key, value in obj.items()}
        if isinstance(obj, list):
            return [resolve_refs(item, active) for item in obj]
        return obj

    top_level = {key: value for key, value in schema.items() if key != "$defs"}
    return resolve_refs(top_level)


# Load environment variables from .env file
# Required: BROWSERBASE_API_KEY (main() reads it from os.environ when it
# builds the client)
load_dotenv()

# Search query - can be company name, ticker symbol, or CIK number
# Examples: "Apple Inc", "AAPL", "0000320193"
# main() types this text into the EDGAR lookup box, uses it again to pick the
# company from the search results, and falls back to it as the company name
# when company-info extraction fails.
SEARCH_QUERY = "Apple Inc"

# Number of filings to retrieve; main() puts this number in the extraction
# instruction and also truncates the extracted list to it.
NUM_FILINGS = 5


def _text(value) -> str:
    """Return *value* as a string, mapping ``None`` to an empty string."""
    return "" if value is None else str(value)


def _normalize_filings(filings_data, limit: int) -> list[dict]:
    """Convert a raw extract result into at most *limit* plain filing dicts.

    Anything that is not a dict holding a list under "filings" yields an empty
    list, and list items that are not dicts are skipped. Missing or null
    fields become empty strings, so later slicing and printing cannot fail.
    """
    if not isinstance(filings_data, dict):
        return []
    rows = filings_data.get("filings")
    if not isinstance(rows, list):
        return []
    keys = ("type", "date", "description", "accessionNumber", "fileNumber")
    normalized = [
        {key: _text(row.get(key)) for key in keys}
        for row in rows
        if isinstance(row, dict)
    ]
    return normalized[:limit]


def _print_report(result: dict) -> None:
    """Print the summary, the per-filing details, and the full JSON dump."""
    divider = "=" * 60

    # Log summary and per-filing details to console
    print("\n" + divider)
    print("SEC FILING METADATA")
    print(divider)
    print(f"Company: {result['company']}")
    print(f"CIK: {result['cik']}")
    print(f"Search Query: {result['searchQuery']}")
    print(f"Filings Retrieved: {len(result['filings'])}")
    print(divider)

    # Display each filing's type, date, description, accession number, file number
    for index, filing in enumerate(result["filings"], start=1):
        print(f"\nFiling {index}:")
        print(f"  Type: {filing['type']}")
        print(f"  Date: {filing['date']}")
        desc = filing["description"]
        # Keep console lines short; the JSON output below has the full text.
        print(f"  Description: {desc[:80]}{'...' if len(desc) > 80 else ''}")
        print(f"  Accession Number: {filing['accessionNumber']}")
        print(f"  File Number: {filing['fileNumber']}")

    # Output full result as JSON for piping or integration
    print("\n" + divider)
    print("JSON OUTPUT:")
    print(divider)
    print(json.dumps(result, indent=2))


async def main():
    """
    Searches SEC EDGAR for a company (by name, ticker, or CIK) and extracts
    recent filing metadata: type, date, description, accession number, file number.
    Uses Stagehand + Browserbase for AI-powered browser automation.

    The search text comes from SEARCH_QUERY and the number of filings from
    NUM_FILINGS. Results are printed to stdout, first as a readable summary
    and then as JSON; nothing is returned.

    A failure while extracting the company name/CIK is reported and replaced
    by fallback values (the search query and "Unknown"). Errors from any other
    step propagate to the caller. The browser session is ended in either case.
    """
    print("Starting SEC Filing Research...")
    print(f"Search query: {SEARCH_QUERY}")
    print(f"Retrieving {NUM_FILINGS} most recent filings\n")

    # Initialize AsyncStagehand client (v3 architecture)
    # Uses environment variable: BROWSERBASE_API_KEY
    client = AsyncStagehand(
        browserbase_api_key=os.environ.get("BROWSERBASE_API_KEY"),
    )

    # Start a new browser session
    start_response = await client.sessions.start(model_name="google/gemini-2.5-flash")
    session_id = start_response.data.session_id
    print(f"Stagehand session started: {session_id}")

    try:
        # Provide live session URL for debugging and monitoring
        print(f"Live View: https://browserbase.com/sessions/{session_id}")

        # Navigate to modern SEC EDGAR company search page
        print("\nNavigating to SEC EDGAR...")
        await client.sessions.navigate(
            id=session_id,
            url="https://www.sec.gov/edgar/searchedgar/companysearch.html",
        )

        # Enter search query in the Company and Person Lookup search box
        print(f"Searching for: {SEARCH_QUERY}")
        await client.sessions.act(
            id=session_id,
            input="Click on the Company and Person Lookup search textbox",
        )
        await client.sessions.act(
            id=session_id,
            input=f'Type "{SEARCH_QUERY}" in the search field',
        )

        # Submit search to load company results
        await client.sessions.act(id=session_id, input="Click the search submit button")

        # Select the matching company from results to view their filings page
        print("Selecting the correct company from results...")
        await client.sessions.act(
            id=session_id,
            input=f'Click on "{SEARCH_QUERY}" in the search results to view their filings',
        )

        # Extract company information from the filings page
        print("Extracting company information...")
        company_info = {"companyName": SEARCH_QUERY, "cik": "Unknown"}
        try:
            extract_response = await client.sessions.extract(
                id=session_id,
                instruction=(
                    "Extract the company name and CIK number from the page"
                    " header or company information section."
                    " The CIK should be a numeric identifier."
                ),
                schema=dereference_schema(CompanyInfo.model_json_schema()),
            )
            extracted = extract_response.data.result
            # Only trust a dict result that actually carries a company name.
            if isinstance(extracted, dict) and extracted.get("companyName"):
                company_info = extracted
        except Exception as error:
            # Best effort: company info is optional, the filings are not.
            print(f"Could not extract company info, using search query as company name: {error}")

        # Extract filing metadata from the filings table using structured schema
        print(f"Extracting the {NUM_FILINGS} most recent filings...")
        filings_response = await client.sessions.extract(
            id=session_id,
            instruction=(
                f"Extract the {NUM_FILINGS} most recent SEC filings from"
                " the filings table. For each filing, get: the filing"
                " type (column: Filings, like 10-K, 10-Q, 8-K), the"
                " filing date (column: Filing Date), description,"
                " accession number (from the link or description),"
                " and file/film number if shown."
            ),
            schema=dereference_schema(FilingsList.model_json_schema()),
        )
        filings_data = filings_response.data.result

        # Build result object with company info and normalized filing list.
        # `or` makes the fallbacks apply to null/empty values, not only to
        # missing keys.
        result = {
            "company": _text(company_info.get("companyName")) or SEARCH_QUERY,
            "cik": _text(company_info.get("cik")) or "Unknown",
            "searchQuery": SEARCH_QUERY,
            "filings": _normalize_filings(filings_data, NUM_FILINGS),
        }

        _print_report(result)

    finally:
        # Always close session to release resources and clean up
        await client.sessions.end(id=session_id)
        print("\nSession closed successfully")


if __name__ == "__main__":
    # Script entry point: run the async workflow and turn any failure into a
    # readable message plus a non-zero exit status.
    try:
        asyncio.run(main())
    except Exception as err:
        print(f"Application error: {err}")
        # Provide helpful troubleshooting information
        print("\nCommon issues:")
        print("  - Check .env file has BROWSERBASE_API_KEY")
        print("  - Verify internet connection and SEC website accessibility")
        print("  - Ensure the search query is valid (company name, ticker, or CIK)")
        print("Docs: https://docs.stagehand.dev/v3/sdk/python")
        # Raise SystemExit directly: the bare exit() helper is installed by the
        # site module for interactive use and is unavailable under `python -S`.
        raise SystemExit(1) from err