Changes from all commits (7185 commits)
d45fa3e
CUDA graph support for prefix caching on hybrid models (#3922)
lmcafee-nvidia Mar 25, 2026
c586f6d
Add ability to perform local gradient accumulation in FP32 for a subs…
deepakn94 Mar 25, 2026
09cce75
Miscellaneous MXFP8 inference fixes (#4017)
santhnm2 Mar 26, 2026
a01a6c5
Use `torch.int64` for grad_num_zero accumulation (#4015)
WanZzzzzz Mar 26, 2026
548028b
Make text generation server hostname configurable (#3935)
santhnm2 Mar 26, 2026
0842ca2
Add --muon-coefficient-type argument for Muon optimizer (#3927)
mchrzanowski Mar 26, 2026
606afda
Pass gracefully if token_id not found in message (#3862)
i-riyad Mar 26, 2026
0528a40
Improve load balancing behavior for prefix cache-aware routing (#3930)
santhnm2 Mar 26, 2026
58e0b85
Refactor setup.py to use get_pybind_include (#3658)
sakgoyal Mar 27, 2026
3758b54
build: Bump TE to 2.14 (#4025)
ko3n1g Mar 27, 2026
d863b7b
chore(beep boop 🤖): Bump (main) (2026-03-30)
github-actions[bot] Mar 30, 2026
a61ce5f
fix traceback when interrupting run (#3439)
dimapihtar Mar 30, 2026
4dcd7d6
chore: update goldenvalues (#4059)
ko3n1g Mar 30, 2026
fc61ce5
Fix TemporalAsyncCaller pin_memory lifetime in async checkpointing (#…
lvdunlin Mar 30, 2026
4bde3a4
chore: Move to Py3.12 (#3826)
ko3n1g Mar 30, 2026
8256553
Update copy-pr-bot.yaml [skip ci]
github-actions[bot] Mar 31, 2026
704c7ee
Adding NVRx as a dependency and keeping the current code base optiona…
dimapihtar Mar 31, 2026
e1db321
chore: Bump versions
ko3n1g Mar 31, 2026
8dd65cd
build: Set `ENV NVTE_BUILD_NUM_PHILOX_ROUNDS=3` (#4074)
ko3n1g Mar 31, 2026
a3a7a0c
fix checkpointing conversion (#4058)
dimapihtar Mar 31, 2026
f09f5c9
chore: Bump versions
ko3n1g Mar 31, 2026
dc113cf
fix(ci): replace actions/setup-python with apt-get to avoid 429 rate …
ko3n1g Mar 31, 2026
8f3bee5
ci: Fix package name for code-freeze workflow (#4077)
ko3n1g Mar 31, 2026
1533beb
chore: bump `_code_freeze` workflow to `v0.86.0` (#4078)
ko3n1g Mar 31, 2026
ef2c8a0
Fix checkpoint inspector (#4079)
janEbert Mar 31, 2026
fd1888b
Update docs to conform to NVIDIA style guides (#4068)
megnvidia Mar 31, 2026
2b85d0a
Miscellaneous inference fixes (#4030)
santhnm2 Mar 31, 2026
15f14fc
fix fine_grained_callables with fused rmsnorm residual (#4026)
CarlosGomes98 Mar 31, 2026
97e36aa
[Main][feat] Support overlapping A2A Combine backprop with wgrad GEMM…
Wohox Mar 31, 2026
7086b61
chore: rotate oncall schedule
github-actions[bot] Apr 1, 2026
3499efe
Modify mfsdp default data-parallel-sharding-strategy (#3691)
wplf Apr 1, 2026
1284d25
Fix fsdp_dtensor conversion for pretrained-only checkpoints (#3912)
DAISY-gh Apr 1, 2026
f9a61e3
Guard NVshmem issues (#4093)
wdykas Apr 1, 2026
fe5291f
m-fsdp: wire use_precision_aware_optimizer from ddp_config to ParamAn…
rapatel Apr 1, 2026
606ac26
Megatron-FSDP: Add MXFP8 transpose helper buffer for Hybrid FSDP (#3918)
shjwudp Apr 1, 2026
8d7a3f8
feat(fsdp): use TE general_gemm for mixed-precision wgrad in FSDP pat…
Victarry Apr 1, 2026
748ac49
Megatron-FSDP: Make _pre_forward_param_unshard and _register_post_bac…
shjwudp Apr 1, 2026
3dc2251
Megatron-FSDP: Fix insufficient double buffers during gradient reduce…
shjwudp Apr 1, 2026
41f3b6f
Fix M-FSDP MXFP8 related BUGs (#3991)
shjwudp Apr 1, 2026
a52ceeb
FIX: Use decoupled gradients for precision-aware M-FSDP grad norm (#3…
XueSongTap Apr 1, 2026
150e37a
[Megatron-FSDP] Fix compatibility with frozen parameters and add unit…
shjwudp Apr 2, 2026
cb3bb41
Align chat completions endpoint with vLLM (#4063)
santhnm2 Apr 2, 2026
159e347
[M-FSDP] Refactor uneven dtensor to full tensor and add UT (#3190)
shjwudp Apr 2, 2026
5a7f520
Add agent instruction files (#4102)
Phlip79 Apr 2, 2026
17a67b9
Bump eopt version (#4100)
skyw Apr 2, 2026
8b8ceb5
Update copy-pr-bot.yaml [skip ci]
github-actions[bot] Apr 3, 2026
5b512b4
Refactor emerging optimizer integration (#4113)
skyw Apr 2, 2026
dcc6d62
Fix over provisioning of Mamba state memory when max_requests is set …
santhnm2 Apr 3, 2026
2697b82
base strategy simplification (#4001)
dimapihtar Apr 3, 2026
69f3b34
add support for DCP and FSDP async save (#4027)
dimapihtar Apr 3, 2026
c9797ad
Add more emerging optimizers (#3907) (#4119)
skyw Apr 3, 2026
76e4daa
Fix FSDP checkpoint conversion and loading for Qwen3.5-VL (#3936)
DAISY-gh Apr 3, 2026
a025a69
docs: update mcore optimizer docstrings to google style (#2799)
Akshat8510 Apr 3, 2026
07db9f7
Update oncall schedule (#4117)
Phlip79 Apr 3, 2026
a72c027
Set tensor-parallel attributes irrespective of perform_initialization…
ilml Apr 3, 2026
c0e3134
docs: add developer-guide skill with CI/CD and failure navigation gui…
ko3n1g Apr 3, 2026
8758d16
chore: Move skills (#4136)
ko3n1g Apr 3, 2026
fd76254
ci: Let Claude react to comment (#4135)
ko3n1g Apr 3, 2026
d865bba
Nemotron3 Super GB200 release config (#4118)
maanug-nv Apr 3, 2026
3d87bfc
Enable CUDA graph for ADAM optimizer (#3429)
vasunvidia Apr 3, 2026
499266a
Claude review should recommend testing (#4137)
Phlip79 Apr 3, 2026
10e7b74
cleanup: remove unused `scatter_gather_tensors_in_pipeline` argument …
Phlip79 Apr 4, 2026
0b8306b
fix: Remove fail-fast (-x) and guard distributed teardown against dea…
ko3n1g Apr 5, 2026
1d43284
chore(beep boop 🤖): Bump (main) (2026-04-06)
github-actions[bot] Apr 6, 2026
7d536c0
Claude: add respond-to-issue skill (#4141)
Phlip79 Apr 6, 2026
6652d57
Fix muon getter backward compatability (#4157)
skyw Apr 6, 2026
97cd326
Audit of user guide (#4098)
megnvidia Apr 6, 2026
0b5e3ae
Fix `RerunStateMachine` crash (`TypeError: 'NoneType' object is not s…
yezhengmao1 Apr 6, 2026
fa5103c
Preserve type of decorated methods/classes (#4062)
nschank Apr 6, 2026
eba2eaf
update muon test case to use new interface (#4163)
skyw Apr 7, 2026
8cbc45b
[M-FSDP] Fix Tensor Parallel mode detection (#3191)
shjwudp Apr 7, 2026
8e8aff6
fix: remove weights_only=False for multimodal example (#4104)
faradawn Apr 7, 2026
52150ab
Cudagraphs: Fix sequence packing segfault more generally (#4162)
mathemakitten Apr 7, 2026
70a7f69
Update copy-pr-bot.yaml [skip ci]
github-actions[bot] Apr 8, 2026
c146305
Make MTP work with materialize_only_last_token_logits (#4166)
santhnm2 Apr 7, 2026
8cf6b35
Add unit test for Mamba EP inference (eager fallback with mixed CUDA …
santhnm2 Apr 8, 2026
2368a2e
chore: rotate oncall schedule
github-actions[bot] Apr 8, 2026
cc025f8
update docs in respect to async changes (#4177)
dimapihtar Apr 8, 2026
1341d8c
update checkpointing docs in respect to async changes (#4208)
dimapihtar Apr 8, 2026
e40feed
chore: improve build-and-test skill with trigger rules and dependency…
ko3n1g Apr 8, 2026
51bcf14
Fix layerwise optimizer with `expt_dp_size=1` and contention with ele…
skyw Apr 8, 2026
50bafa0
ci: add --cluster-a100/h100/gb200 args to trigger_internal_ci.py (#4195)
ko3n1g Apr 8, 2026
41595d0
ci: Update golden values for nightly tests (#4215)
chtruong814 Apr 8, 2026
a614df4
Update copy-pr-bot.yaml [skip ci]
github-actions[bot] Apr 9, 2026
1b425ab
rename async_allgather to overlap_param_gather (#4217)
skyw Apr 9, 2026
8f72c54
Fix Slack sync for users with GitHub email privacy enabled (#4220)
Phlip79 Apr 9, 2026
980211a
Miscellaneous MTP inference fixes (#4191)
santhnm2 Apr 9, 2026
be09bb6
Move inference guards out of arguments.py (#4210)
mathemakitten Apr 9, 2026
92d5c1f
Fix: enable fine-grained activation offloading for Mamba model. (#4173)
fanshiqing Apr 9, 2026
b5354a8
bump NVRx (#4178)
dimapihtar Apr 9, 2026
09312c8
Update tokenizer args for Nemotron3 release config (#4239)
maanug-nv Apr 9, 2026
2f5c62c
build: add dynamic git-versioning and drop rc0 pre-release tag (#4212)
ko3n1g Apr 9, 2026
567d4d4
Fix unnecessary permute padding for non-quantized MoE dispatch (#4038)
xiaoxi-wangfj Apr 10, 2026
51b0950
Fix split state dict main (#3676)
kunlunl Apr 10, 2026
22e0bb5
Enable FP8 DPA for MXFP8 recipe (#4066)
vasunvidia Apr 10, 2026
3d4de97
Add /split-pr Claude Code command for splitting PRs by CODEOWNERS (#4…
Phlip79 Apr 10, 2026
2ebfbb2
Enable AG/RS overlap with explicit process group passing (#3249)
jeffnvidia Apr 10, 2026
d30c3ae
Enable cpu_offloading with Full iteration CUDA graph (#3969)
vasunvidia Apr 10, 2026
e8e79a4
Fix TransformerConfig validation for mixed dense/MoE upcycling (#3647)
rkteddy Apr 10, 2026
0602523
Remove cross-rank synchronization during checkpoint load & deprecate …
asolergi-nv Apr 10, 2026
ab43d43
Fix incorrectly set decoupled_grad and DistOpt mechanics for MFSDP. (…
cspades Apr 10, 2026
45a49eb
Refit Miscelaneous (#3973)
wdykas Apr 10, 2026
1daa19f
Add conditions_embeddings argument to TransformerBlock, TransformerLa…
huvunvidia Apr 10, 2026
705d8ed
Update copy-pr-bot.yaml [skip ci]
github-actions[bot] Apr 11, 2026
59fc894
Fix build_sequences_per_dataset output path arg usage (#4144)
DhineshPonnarasan Apr 11, 2026
7cbc68c
ci: Flush pending CUDA work before the barrier in destroy_model_paral…
chtruong814 Apr 11, 2026
e3d1204
Update oncall schedule (#4257)
Phlip79 Apr 12, 2026
41fcaa4
docs(moe): Update MoE README (#3664)
sbhavani Apr 13, 2026
cc4cb01
Revert "Add conditions_embeddings argument to TransformerBlock, Trans…
ko3n1g Apr 13, 2026
6da6267
reduce the number of shared expert streams (#3752)
yangbofun Apr 13, 2026
7f8f37e
remove legacy Bert code (#4204)
dimapihtar Apr 13, 2026
20ba03f
[Main] Feat(moe): Gated delta net context parallel (CP) (#2642)
yuzhongw-nvidia Apr 13, 2026
81b8b5a
remove t5 legacy code (#4203)
dimapihtar Apr 13, 2026
dda6901
fix: handle list-typed process groups in ProcessGroupCollection.__rep…
cluster2600 Apr 13, 2026
10f5fbd
Fix Context Parallelism documentation link (#4149)
liangxs Apr 13, 2026
5dcda19
[MLA] fix: Pad V when Q/V head dims differ for THD (#3003)
HollowMan6 Apr 13, 2026
d85365b
Allow the evaluation batch size to differ from the training batch siz…
michal2409 Apr 13, 2026
25129bf
fix(megatron-fsdp): build expt_device_mesh only for MoE models (#3831)
xuwchen Apr 13, 2026
a1595b8
Add @NVIDIA/transformer review group to megatron/core/transformer/ (#…
Phlip79 Apr 13, 2026
5f80f0a
Reset AG_pipeline bucket status after validation step. (#3155)
vasunvidia Apr 13, 2026
e32e323
Update copy-pr-bot.yaml [skip ci]
github-actions[bot] Apr 14, 2026
df929f5
Enhance and fix NVTX for training (#3642)
yaox12 Apr 14, 2026
e1db4a0
NVFP4 native weights for DDP (#4005)
WanZzzzzz Apr 14, 2026
1ca250c
Remove unnecessary arguments for layerwise distributed optimizer (#4272)
FDecaYed Apr 14, 2026
28e13c4
reuse grad buffer for layer-wise param allgather (#3751)
FDecaYed Apr 14, 2026
e9e513a
feat(ci): add strict review mode to Claude review workflow (#4197)
Victarry Apr 14, 2026
cd03c3e
Fix stale approvals (#4280)
Phlip79 Apr 14, 2026
eb80b74
[MoE] Add a new score function to the router (#3673)
yaox12 Apr 14, 2026
ebfa138
[MoE] Improvement of shared expert overlap, support shared expert ove…
Victarry Apr 14, 2026
123645b
build: bump DeepEP to 34152ae (#4228)
ko3n1g Apr 14, 2026
3d7a701
ci: mark test_fused_indexer_loss_gradient_tp_consistency as flaky_in_…
ko3n1g Apr 14, 2026
4cef23c
Fix typo in PR4133. (#4277)
cspades Apr 14, 2026
bcec618
ci: add retry loop to apt-get update to handle transient mirror sync …
ko3n1g Apr 14, 2026
4e85d74
fix: enforce correct pass thresholds for deterministic and approximat…
ko3n1g Apr 14, 2026
d245c44
remove legacy biencoder and realm models (#4205)
dimapihtar Apr 14, 2026
6636eb0
ci: add configurable launcher support for functional tests (ft_launch…
ko3n1g Apr 14, 2026
d530a04
chore: document --target main for local Docker builds (#4307)
ko3n1g Apr 14, 2026
97aca2f
Extract args init to launch scripts (#4225)
maanug-nv Apr 14, 2026
c2d1a8f
[Main] Fix TE version check for retain_pinned_cpu_buffers in cpu offl…
BestJuly Apr 15, 2026
b342602
chore: rotate oncall schedule
github-actions[bot] Apr 15, 2026
1d344ae
Fix documented shape (#3486)
janEbert Apr 15, 2026
4a79536
ci: add sync-skills workflow, rename CLAUDE.md → AGENTS.md, move .cla…
ko3n1g Apr 15, 2026
57fc3ae
chore(beep boop 🤖): symlink skills/ → .claude/skills, .agents/skills …
github-actions[bot] Apr 16, 2026
23265d2
Get `device` correctly when module returns a dict instead of individu…
shifangx Apr 16, 2026
e69cf43
remove vision legacy code (#4202)
dimapihtar Apr 16, 2026
f098fe8
feat: long convergence resiliency for release tests (#4335)
ko3n1g Apr 16, 2026
8681ebb
ci(action): improve GitHub Actions output UX (#4337)
ko3n1g Apr 16, 2026
ceac269
build: bump TransformerEngine to release_v2.14 (#4331)
ko3n1g Apr 16, 2026
efbe7a1
feat: add create-issue skill (#4338)
ko3n1g Apr 16, 2026
97f9ab6
Set megatron-fsdp to 0.5.0
ko3n1g Apr 16, 2026
01eb7e8
M4 leftover for TE cuda graph (#3137)
shifangx Apr 16, 2026
260cba7
fix: wait for async P2P send before deallocating output tensor (#4047)
ZhiyuLi-Nvidia Apr 16, 2026
2aeaf56
ci(gb200): add 1-node mr-github functional test variants (#4334)
ko3n1g Apr 17, 2026
ded22f4
Fix potential coredump issue that occurs when saving a checkpoint (#1…
ezioliao Apr 17, 2026
30bc230
docs: bump versions1.json to 0.17.0 (latest) (#4360)
ko3n1g Apr 17, 2026
a00e944
Port DeepSeek Sparse Attention to `MambaModel` (#3553)
janEbert Apr 17, 2026
23663a8
Add tables and histogram for RL staleness (#4097)
tdene Apr 17, 2026
4ece77d
[docs] ci: use parent-relative json_url for version picker (#4367)
ko3n1g Apr 17, 2026
ed5de26
Fix bug with non-partial rollouts (#3964)
tdene Apr 17, 2026
e15ec3c
Add QK layernorm support for dot-product attention in MambaModel (#4067)
Phlip79 Apr 17, 2026
75a2878
Docs: improve docstrings and comments in example training loop (#4041)
DhineshPonnarasan Apr 17, 2026
86b7218
Update copy-pr-bot.yaml [skip ci]
github-actions[bot] Apr 18, 2026
9978968
feat(ckpt): add --async-ckpt-use-cpu-shm argument (#4355)
sbak5 Apr 18, 2026
664baa8
cp: Fix UT timeout (#4310) (#4373)
chtruong814 Apr 18, 2026
e4d3a4c
Fix RL reward due to stop token (#4096)
tdene Apr 18, 2026
76ac7c2
FA4 Inference (#4186)
wdykas Apr 18, 2026
3315c86
Make param_index_map always use unpacked (full numel) offsets (#4328)
deepakn94 Apr 18, 2026
8be1e79
Add activation logging and tokens per expert logging (#3842)
Mellonta Apr 18, 2026
98a51eb
Fix RL to once again work with --skip-train (#4249)
tdene Apr 18, 2026
afae25b
Fix Megatron initialization with extra_args_provider (#4327)
santhnm2 Apr 18, 2026
15e07a2
Rename MambaModel/MambaStack to HybridModel/HybridStack (#4099)
Phlip79 Apr 19, 2026
3046182
chore(beep boop 🤖): Bump (main) (2026-04-20)
github-actions[bot] Apr 20, 2026
9c210f7
fix(ci): wrap uv install in retry block (#4387)
ko3n1g Apr 20, 2026
7928a84
Call save_checkpoint_and_time() when saving checkpoint and compute el…
awsankur Apr 20, 2026
ef1888b
refactor(tests): move NCCL env vars from docker launcher to shell tra…
ko3n1g Apr 20, 2026
b562151
Remove packed_attention_mask unused parameter (#3859)
tdene Apr 20, 2026
859b66a
Second batch of audit edits (#4115)
megnvidia Apr 20, 2026
c9e03d0
Replace rampup batch size scheduler with custom step batch size sched…
mkhona-nvidia Apr 20, 2026
dc87858
Update copy-pr-bot.yaml [skip ci]
github-actions[bot] Apr 21, 2026
a52112d
revert: replace rampup batch size scheduler with custom step batch si…
ko3n1g Apr 21, 2026
0b9bc20
Update copy-pr-bot.yaml [skip ci]
github-actions[bot] Apr 21, 2026
532ad92
Replace rampup batch size scheduler with custom step batch size sched…
deepakn94 Apr 21, 2026
e5ec9ab
Megatron-FSDP: log mcore detection only after imports succeed (#4400)
wujingyue Apr 21, 2026
a550e0e
ci(gb200): re-enable tunable_overlap 1-node mr-github test (#4405)
ko3n1g Apr 21, 2026
77e2dd4
Fix local docs building (#4416)
Phlip79 Apr 21, 2026
e778967
RL: Onload optimizer after logprobs computation (#4235)
tdene Apr 22, 2026
bbc6b4d
chore: rotate oncall schedule
github-actions[bot] Apr 22, 2026
7597a0d
Add RL token throughput and packing metrics (#3877)
tdene Apr 22, 2026
9834c99
ci: remove publish:merge_into_dev job (#4421)
ko3n1g Apr 22, 2026
a6bfe1a
docs: add data loading best practices for large-scale training (#4236)
sbhavani Apr 22, 2026
384e618
Fix: Auto enable manual registration and enhance the docummentation (…
youngeunkwon0405 Apr 22, 2026
9a3c927
Fix nvtx_decorator to check _nvtx_enabled at call time (#4184)
minitu Apr 22, 2026
60f71e1
fix merges_file typo in megatron_hf_tokenizer (#4392)
chelseajohn Apr 22, 2026
c9dfe34
Enable NullTokenizer for pretraining to reduce I/O access (#4057)
asolergi-nv Apr 22, 2026
7073492
docs: Add SECURITY.md (#4431)
chtruong814 Apr 22, 2026
40627d0
Mamba inference opt (#4414)
wdykas Apr 22, 2026
55b8111
DDP refactoring: Extract parameter layout computation into optimizer …
deepakn94 Apr 22, 2026
90e09b6
Update PR template with explicit request for issue (#4409)
Phlip79 Apr 22, 2026
ab2b33d
Misc inference fixes (#4397)
sidsingh-nvidia Apr 23, 2026
60408d5
Rename Mamba to Hybrid outside megatron/core (#4159)
Phlip79 Apr 23, 2026
a52014c
Include mtp layers in token per expert logging (#4412)
Mellonta Apr 23, 2026
32275b2
fix: NVRx async compatibility and defer resiliency import (#4420)
sbak5 Apr 23, 2026
9bb35a8
ci: add base_sha to codecov/codecov-action upload step (#4445)
ko3n1g Apr 23, 2026
3034d86
Update copy-pr-bot.yaml [skip ci]
github-actions[bot] Apr 24, 2026
f78ed05
fix(checkpoint_inspector): allow empty --param-to-param-group-map-jso…
DAISY-gh Apr 24, 2026
4d6cdd5
Add the YARN support for hybrid_model (#4244)
guihong-nv Apr 24, 2026
41ffa83
[training migration] Add container class for config dataclasses (#4227)
maanug-nv Apr 24, 2026
a1165fa
Inference: Fix broken functional tests on gitlab (#4454)
sidsingh-nvidia Apr 24, 2026
d4cacef
SafeUnpickler class for safe pickle usage (#4319)
dimapihtar Apr 24, 2026
109feda
get rid of weights_only=False (#4434)
dimapihtar Apr 24, 2026
64870c1
Inference | Per-block MoE routing storage for prefix caching (#4301)
lmcafee-nvidia Apr 24, 2026
017e684
Add troubleshooting tip for 'access forbidden' (#4449)
balasaajay Apr 24, 2026
3d7bcd3
Fix checkpoint loading with rerun state machine (#4448)
YangFei1990 Apr 24, 2026
9b02206
Add misc CUDA graph sugar to CudaGraphManager (#4425)
tdene Apr 24, 2026
35f76df
Inference: Add the embedding and output layer in the full_iteration_i…
sidsingh-nvidia Apr 24, 2026
481efd0
Important bugfixes in local CG implementation that were leading to lo…
jiemingz Apr 24, 2026
e9abb6c
fix: Replace polynomial rolling hash with SHA-256 for prefix caching …
lmcafee-nvidia Apr 24, 2026
377af02
feat(ckpt): expose validate_access_integrity knob on dist-ckpt load (…
asolergi-nv Apr 24, 2026
241a5ca
Fix multivalidation (#3388)
RPrenger Apr 25, 2026
f2dcd42
Add missing knob for reduce_scatter_with_fp32_accumulation (#4410)
WanZzzzzz Apr 25, 2026
03f4111
Enable CUDA graphs for MTP inference (#4260)
santhnm2 Apr 26, 2026
1879dc2
chore(beep boop 🤖): Bump (main) (2026-04-27)
github-actions[bot] Apr 27, 2026
970c254
checkpoint integrity verification (#4305)
dimapihtar Apr 27, 2026
ebd70d3
Fix cache gating (#4455)
wdykas Apr 27, 2026
0447347
[Main] Fix FusedAdam.use_decoupled_grad mis-set for Megatron-FSDP. (#…
cspades Apr 27, 2026
8c5cf05
add permute fusion into hybrid ep (#4089)
Autumn1998 Apr 28, 2026
42e396e
Add ColocatedBridgeCommunicator for heterogeneous TP/DP MIMO training…
yashaswikarnati Apr 28, 2026
6fd6652
Fix incorrect bias display in extra_repr of Column/RowParallelLinear …
HelloWorldBeginner Apr 28, 2026
c8a4bfd
Fix assertion logic in combined_1f1b_schedule_for_interleaved_pipelin…
joapolarbear Apr 28, 2026
374fa85
ci: Fix event name reference in CI workflow condition for merge group…
balasaajay Apr 28, 2026
9c15290
Add manual sync workflow from main to dev (#4165)
Phlip79 Apr 28, 2026
9816140
fix: handle list-format quant_cfg from ModelOpt PR #1094 (#4187)
ChenhanYu Apr 28, 2026
9e98259
ci: also add Run MBridge tests label in nightly sync workflow (#4499)
Phlip79 Apr 28, 2026
533dc75
Update copy-pr-bot.yaml [skip ci]
github-actions[bot] Apr 29, 2026
1c4e537
[training migration] Add serialization features to config container (…
maanug-nv Apr 29, 2026
f4a49cf
Fix conflict with inference graphs (#4504)
tdene Apr 29, 2026
251c6e9
chore: rotate oncall schedule
github-actions[bot] Apr 29, 2026
c5201a0
Add tools/prepare_cache.py for offline GPT dataset cache preparation …
asolergi-nv Apr 29, 2026
cb3d5d9
[build] fix: move mamba-ssm and causal-conv1d to optional [ssm] extra…
ko3n1g Apr 29, 2026
4e208a8
mamba: avoid redundant HBM reloads in causal_conv1d_update shift loop…
wdykas Apr 29, 2026
3f59bbb
Standardize misc graph interface (#4485)
tdene Apr 29, 2026
29864b2
Fix inference graph override in RL flow (#4323)
tdene Apr 29, 2026
b23aa3f
Unify and refactor Megatron-FSDP documentation. (#4418)
cspades Apr 29, 2026
51ea07e
Revert "ci: add base_sha to codecov/codecov-action upload step (#4445…
chtruong814 Apr 29, 2026
cfee04e
Skills for running unit tests and working with slurm (#4502)
yashaswikarnati Apr 29, 2026
0d98cb8
Reorganize order of operations in inference context and text generati…
tdene Apr 29, 2026
0c52c39
ci: Update CI workflow conditions to include merge group handling (#4…
balasaajay Apr 30, 2026
6ba794b
ci: add base_sha to codecov/codecov-action upload step (#4540)
chtruong814 Apr 30, 2026
580d53a
Fix release tests: remove --global-batch-size conflicting with --step…
deepakn94 Apr 30, 2026
77afc60
docs: use @file-path notation for file references in skills (#4542)
ko3n1g Apr 30, 2026
1 change: 1 addition & 0 deletions .agents/skills
1 change: 1 addition & 0 deletions .claude/skills
39 changes: 39 additions & 0 deletions .coderabbit.yaml
@@ -0,0 +1,39 @@
# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json
language: "en-US"

# Only comment on Critical/Major bugs. No Minor, Trivial, or style comments.
tone_instructions: "Only comment on Critical or Major bugs. Never comment on Minor issues, style, refactoring, or suggestions. When in doubt, stay silent."

reviews:
  # Use chill profile - filters out nitpicks automatically
  profile: "chill"

  # Disable all summary features
  high_level_summary: false
  high_level_summary_in_walkthrough: false

  # Disable walkthrough comment entirely
  collapse_walkthrough: true
  changed_files_summary: false
  sequence_diagrams: false

  # Disable status/effort estimates
  review_status: false
  commit_status: false
  estimate_code_review_effort: false

  # Disable auto-suggestions for labels/reviewers
  suggested_labels: false
  suggested_reviewers: false

  # Disable related issues/PRs lookup
  assess_linked_issues: false
  related_issues: false
  related_prs: false

  # Auto-review disabled - only review when explicitly requested via @coderabbitai review
  auto_review:
    enabled: false

chat:
  auto_reply: true
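The effect of this config is easiest to see by checking which boolean features end up switched off. A minimal stdlib-only sketch (no YAML library; it naively scans `key: true/false` lines, ignoring nesting) against a hand-copied subset of the file above:

```python
# Naive sketch: extract boolean flags from a .coderabbit.yaml-style text
# and confirm the noisy features are off. CONFIG is a hand-picked subset
# of the config added in this PR, not the full file.
CONFIG = """\
reviews:
  profile: "chill"
  high_level_summary: false
  review_status: false
  auto_review:
    enabled: false
chat:
  auto_reply: true
"""

def flat_bools(text):
    """Collect 'key: true/false' pairs, ignoring nesting depth."""
    flags = {}
    for line in text.splitlines():
        line = line.strip()
        if ": " in line:
            key, _, val = line.partition(": ")
            if val in ("true", "false"):
                flags[key] = (val == "true")
    return flags

flags = flat_bools(CONFIG)
assert flags["enabled"] is False     # reviews.auto_review.enabled
assert flags["auto_reply"] is True   # chat.auto_reply
```

Note the one feature left on is `chat.auto_reply`, so the bot still answers direct `@coderabbitai` mentions even though unsolicited reviews are disabled.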
1 change: 1 addition & 0 deletions .cursorrules
@@ -0,0 +1 @@
See CLAUDE.md for all repository guidelines.
4 changes: 4 additions & 0 deletions .flake8
@@ -0,0 +1,4 @@
[flake8]
max-line-length = 100
extend-ignore = E203,E501,F401,E402,E714
per-file-ignores = __init__.py:F401
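The `.flake8` file uses INI syntax, so it can be inspected with the standard-library `configparser`. A small sketch (the inline string mirrors the fragment above):

```python
import configparser

# Read the .flake8 fragment with configparser (flake8's config format is
# configparser-compatible) and inspect the ignore settings.
FLAKE8 = """\
[flake8]
max-line-length = 100
extend-ignore = E203,E501,F401,E402,E714
per-file-ignores = __init__.py:F401
"""

cp = configparser.ConfigParser()
cp.read_string(FLAKE8)

ignored = cp["flake8"]["extend-ignore"].split(",")
assert "E501" in ignored  # long lines are not flagged by flake8 itself
assert cp["flake8"].getint("max-line-length") == 100
```

Ignoring E501 while setting `max-line-length = 100` suggests line length is enforced by a formatter rather than by flake8.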
67 changes: 67 additions & 0 deletions .github/CODEOWNERS
@@ -0,0 +1,67 @@
megatron/core/ @NVIDIA/core-adlr @NVIDIA/core-nemo

megatron/core/models/gpt/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/gpt

megatron/core/models/multimodal/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/multi-modal

megatron/core/models/mamba/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-mamba
megatron/core/ssm/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-mamba

megatron/core/models/hybrid/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-model

megatron/core/datasets/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/datasets

megatron/core/tokenizers/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/tokenizers

megatron/core/distributed/fsdp/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/megatron-fsdp

megatron/core/transformer/fsdp_dtensor_checkpoint.py @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/megatron-fsdp

megatron/core/dist_checkpointing/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/dist-checkpointing

megatron/core/optimizer/distrib_optimizer/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/dist-optimizer

megatron/core/inference/modelopt_support @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/quantization-and-inference

megatron/core/datasets/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/datasets

megatron/core/pipeline_parallel/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/pipeline-parallelism

megatron/core/transformer/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/transformer

megatron/core/transformer/moe/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/mixture-of-experts-adlr @NVIDIA/mixture-of-experts-devtech

megatron/core/inference/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/inference

megatron/core/parallel_state.py @NVIDIA/core-adlr @NVIDIA/core-nemo

megatron/core/post_training/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/post-training

megatron/post_training/ @NVIDIA/post-training

megatron/core/transformer/cuda_graphs.py @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/cuda-graphs

megatron/training/ @NVIDIA/training-adlr @NVIDIA/training-nemo
megatron/training/arguments.py

.gitlab/ @NVIDIA/ci
.github/ @NVIDIA/ci
.github/oncall_schedule.json @NVIDIA/mcore-oncall-rotation
.gitlab-ci.yml @NVIDIA/ci
docker/ @NVIDIA/ci
tests/functional_tests/python_test_utils/ @NVIDIA/ci
tests/functional_tests/shell_test_utils/ @NVIDIA/ci
tests/test_utils/recipes/ @NVIDIA/ci
tests/unit_tests/run_ci_test.sh @NVIDIA/ci

# API Backwards Compatibility Check
scripts/check_api_backwards_compatibility.py @NVIDIA/ci
scripts/README_API_COMPAT.md @NVIDIA/ci
.github/workflows/check_api_backwards_compatibility_workflow.yml @NVIDIA/ci
docs/api-backwards-compatibility-check.md @NVIDIA/ci
tests/unit_tests/test_api_backwards_compat_setup.py @NVIDIA/ci

megatron/rl/ @NVIDIA/reinforcement-learning
examples/rl/ @NVIDIA/reinforcement-learning
test/unit_tests/test_rl_utils.py @NVIDIA/reinforcement-learning
train_rl.py @NVIDIA/reinforcement-learning
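CODEOWNERS resolution is last-match-wins: for a changed path, the final matching pattern determines the owners, and a bare entry with no teams (like `megatron/training/arguments.py` above) clears ownership for that path. An illustrative sketch — not GitHub's exact matcher; directory patterns here match by simple prefix, and `RULES` is a hand-picked subset of the file above:

```python
# Simplified CODEOWNERS resolver: iterate rules in file order and keep
# the owners of the *last* matching pattern.
RULES = [
    ("megatron/core/", ["@NVIDIA/core-adlr", "@NVIDIA/core-nemo"]),
    ("megatron/core/transformer/moe/", ["@NVIDIA/core-adlr", "@NVIDIA/core-nemo",
                                        "@NVIDIA/mixture-of-experts-adlr",
                                        "@NVIDIA/mixture-of-experts-devtech"]),
    ("megatron/training/", ["@NVIDIA/training-adlr", "@NVIDIA/training-nemo"]),
    ("megatron/training/arguments.py", []),  # bare entry: no owners requested
]

def owners(path):
    matched = []
    for pattern, teams in RULES:
        # simplified semantics: trailing-slash patterns match by prefix,
        # other patterns match the path exactly
        if path == pattern or (pattern.endswith("/") and path.startswith(pattern)):
            matched = teams  # last match wins
    return matched

assert "@NVIDIA/mixture-of-experts-adlr" in owners("megatron/core/transformer/moe/router.py")
assert owners("megatron/training/arguments.py") == []
```

This is why the MoE line can add extra reviewers on top of the core teams, and why the owner-less `arguments.py` line exempts that one file from the `megatron/training/` rule above it.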
29 changes: 29 additions & 0 deletions .github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,29 @@
---
name: Bug report
about: Create a report to help us improve the repository or project
title: ""
labels: bug
assignees: ''

---

**Describe the bug**

A clear and concise description of what the bug is. Tag the [@mcore-oncall](https://github.com/orgs/NVIDIA/teams/mcore-oncall)
to get oncall's attention to this issue.

**Steps/Code to reproduce bug**

Please list *minimal* steps or code snippet for us to be able to reproduce the bug.

A helpful guide on how to craft a minimal bug report: http://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports.


**Expected behavior**

A clear and concise description of what you expected to happen.


**Additional context**

Add any other context about the problem here.
2 changes: 2 additions & 0 deletions .github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,2 @@
blank_issues_enabled: false

23 changes: 23 additions & 0 deletions .github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,23 @@
---
name: Feature request
about: Suggest an idea for this project
title: ""
labels: enhancement
assignees: ''

---

**Is your feature request related to a problem? Please describe.**
A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]

Tag the [@mcore-oncall](https://github.com/orgs/NVIDIA/teams/mcore-oncall)
to get oncall's attention to this issue.

**Describe the solution you'd like**
A clear and concise description of what you want to happen.

**Describe alternatives you've considered**
A clear and concise description of any alternative solutions or features you've considered.

**Additional context**
Add any other context or screenshots about the feature request here.
13 changes: 13 additions & 0 deletions .github/ISSUE_TEMPLATE/question.md
@@ -0,0 +1,13 @@
---
name: QUESTION
about: Ask a question about Megatron-LM that is not a bug, regression or enhancement request
title: "[QUESTION]"
labels: ''
assignees: ''

---

**Your question**
Ask a clear and concise question about Megatron-LM. Tag the [@mcore-oncall](https://github.com/orgs/NVIDIA/teams/mcore-oncall)
to get oncall's attention to this issue.
40 changes: 40 additions & 0 deletions .github/ISSUE_TEMPLATE/regression.md
@@ -0,0 +1,40 @@
---
name: REGRESSION
about: Report a regression in speed or accuracy due to a Megatron-LM update
title: "[REGRESSION]"
labels: ''
assignees: ''

---

**Describe the regression**
A clear and concise description of what the regression is. Tag the [@mcore-oncall](https://github.com/orgs/NVIDIA/teams/mcore-oncall)
to get oncall's attention to this issue.

**To Reproduce**
Steps to reproduce the behavior. The easier it is to reproduce, the faster it will get maintainer attention.

**Previous performance**
What speed or accuracy did you previously see?

**New performance**
What speed or accuracy do you see after the update?

**Stack trace/logs**
If applicable, add the stack trace or logs related to the regression.

**Environment (please complete the following information):**
- Previous Megatron-LM commit ID
- New Megatron-LM commit ID
- Previous PyTorch version
- New PyTorch version
- Previous CUDA version
- New CUDA version
- Previous NCCL version
- New NCCL version

**Proposed fix**
If you have a proposal for how to fix the issue, state it here or link to a PR.

**Additional context**
Add any other context about the problem here.