Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 62 additions & 1 deletion nodescraper/base/regexanalyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,11 @@
###############################################################################
import datetime
import re
from typing import Optional, Union
from typing import Optional, Sequence, Union

from pydantic import BaseModel

from nodescraper.base.match_ignore import ParsedIgnoreMatchRule, should_ignore_match
from nodescraper.enums import EventCategory, EventPriority
from nodescraper.generictypes import TAnalyzeArg, TDataModel
from nodescraper.interfaces.dataanalyzertask import DataAnalyzer
Expand Down Expand Up @@ -121,6 +122,51 @@ def _extract_timestamp_from_match_position(
timestamp_match = self.TIMESTAMP_PATTERN.search(first_line)
return timestamp_match.group(1) if timestamp_match else None

def _line_at_match_position(self, content: str, match_start: int) -> str:
"""Return the full line containing a regex match start position.

Args:
content: Full content being analyzed.
match_start: Start position of the regex match.

Returns:
str: Line text containing the match.
"""
line_start = content.rfind("\n", 0, match_start) + 1
line_end = content.find("\n", match_start)
if line_end == -1:
line_end = len(content)
return content[line_start:line_end]

def _should_ignore_regex_match(
self,
content: str,
match_start: int,
match_text: str,
error_regex_message: str,
ignore_match_rules: Sequence[ParsedIgnoreMatchRule],
) -> bool:
"""Return True when ignore_match_rules say to skip this regex hit.

Args:
content: Full content being analyzed.
match_start: Start position of the regex match.
match_text: Regex match text.
error_regex_message: ErrorRegex.message for the pattern that matched.
ignore_match_rules: Parsed ignore rules.

Returns:
bool: True when the match should be skipped.
"""
if not ignore_match_rules:
return False
return should_ignore_match(
line=self._line_at_match_position(content, match_start),
match_text=match_text,
error_regex_message=error_regex_message,
rules=ignore_match_rules,
)

def _convert_and_extend_error_regex(
self,
custom_regex: Optional[Union[list[ErrorRegex], list[dict]]],
Expand Down Expand Up @@ -198,13 +244,15 @@ def check_all_regexes(
group: bool = True,
num_timestamps: int = 3,
interval_to_collapse_event: int = 60,
ignore_match_rules: Optional[Sequence[ParsedIgnoreMatchRule]] = None,
) -> list[RegexEvent]:
"""Iterate over all ERROR_REGEX and check content for any matches

Enhanced with timestamp-based event collapsing:
- Extracts timestamps from matched lines
- Collapses events within interval_to_collapse_event seconds
- Prunes timestamp lists to keep first N and last N timestamps
- Skips matches that satisfy ignore_match_rules

Args:
content (str): content to match regex on
Expand All @@ -213,6 +261,7 @@ def check_all_regexes(
group (bool, optional): flag to control whether matches should be grouped together. Defaults to True.
num_timestamps (int, optional): maximum number of timestamps to keep for each event. Defaults to 3.
interval_to_collapse_event (int, optional): time interval in seconds to collapse events. Defaults to 60.
ignore_match_rules (Optional[Sequence[ParsedIgnoreMatchRule]], optional): Parsed skip rules. Defaults to None.

Returns:
list[RegexEvent]: list of regex event objects
Expand Down Expand Up @@ -246,8 +295,20 @@ def _is_within_interval(new_timestamp_str: str, existing_timestamps: list[str])
continue
return False

skip_rules = list(ignore_match_rules) if ignore_match_rules else []

for error_regex_obj in error_regex:
for match_obj in error_regex_obj.regex.finditer(content):
raw_match = match_obj.group(0)
if self._should_ignore_regex_match(
content,
match_obj.start(),
raw_match,
error_regex_obj.message,
skip_rules,
):
continue

# Extract timestamp from the line where match occurs
timestamp = self._extract_timestamp_from_match_position(content, match_obj.start())

Expand Down
10 changes: 10 additions & 0 deletions nodescraper/plugins/inband/dmesg/analyzer_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

from pydantic import Field

from nodescraper.base.match_ignore import IgnoreMatchRuleSpec
from nodescraper.base.regexanalyzer import ErrorRegex
from nodescraper.models import TimeRangeAnalysisArgs

Expand Down Expand Up @@ -69,3 +70,12 @@ class DmesgAnalyzerArgs(TimeRangeAnalysisArgs):
"(CPU, GPU BDF/block, etc.) reaches or exceeds this value."
),
)
ignore_match_rules: Optional[list[IgnoreMatchRuleSpec]] = Field(
default=None,
description=(
"Rules that skip regex matches during analysis. Each rule may use line_regex, "
"match_regex, message, and/or mce_banks. Within a rule all specified fields must "
"match; any matching rule suppresses the hit. mce_banks accepts bank ids and "
'inclusive ranges such as "60-63".'
),
)
22 changes: 18 additions & 4 deletions nodescraper/plugins/inband/dmesg/dmesg_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import re
from typing import Optional

from nodescraper.base.match_ignore import parse_ignore_match_rules
from nodescraper.base.regexanalyzer import ErrorRegex, RegexAnalyzer
from nodescraper.connection.inband import TextFileArtifact
from nodescraper.enums import EventCategory, EventPriority
Expand Down Expand Up @@ -641,10 +642,19 @@ def _resolve_priority(

return current_priority # if no rules are matched, keep the current priority

def _check_mce_threshold(self, dmesg_content: str, threshold: int) -> None:
def _check_mce_threshold(
self,
dmesg_content: str,
threshold: int,
ignore_mce_banks: frozenset[int],
) -> None:
"""Raise ERROR events when correctable MCE counts per component reach the threshold."""
correctable_counts = parse_correctable_mce_counts(dmesg_content)
uncorrectable_counts = parse_uncorrectable_mce_counts(dmesg_content)
correctable_counts = parse_correctable_mce_counts(
dmesg_content, ignore_banks=ignore_mce_banks
)
uncorrectable_counts = parse_uncorrectable_mce_counts(
dmesg_content, ignore_banks=ignore_mce_banks
)

for part, count in sorted(correctable_counts.items()):
if count >= threshold:
Expand Down Expand Up @@ -703,12 +713,15 @@ def analyze_data(
else:
dmesg_content = data.dmesg_content

ignore_match_rules, ignore_mce_banks = parse_ignore_match_rules(args.ignore_match_rules)

known_err_events = self.check_all_regexes(
content=dmesg_content,
source="dmesg",
error_regex=final_error_regex,
num_timestamps=args.num_timestamps,
interval_to_collapse_event=args.interval_to_collapse_event,
ignore_match_rules=ignore_match_rules,
)
if args.exclude_category:
known_err_events = [
Expand Down Expand Up @@ -738,6 +751,7 @@ def analyze_data(
error_regex=unknown_dmesg_error_regexes,
num_timestamps=args.num_timestamps,
interval_to_collapse_event=args.interval_to_collapse_event,
ignore_match_rules=ignore_match_rules,
)

for err_event in err_events:
Expand All @@ -746,6 +760,6 @@ def analyze_data(
self.result.events.append(err_event)

if args.mce_threshold is not None:
self._check_mce_threshold(dmesg_content, args.mce_threshold)
self._check_mce_threshold(dmesg_content, args.mce_threshold, ignore_mce_banks)

return self.result
113 changes: 113 additions & 0 deletions nodescraper/plugins/inband/dmesg/mce_bank_ignore.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
###############################################################################
#
# MIT License
#
# Copyright (c) 2026 Advanced Micro Devices, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
###############################################################################
import re
from typing import Optional, Sequence, Union

# Matches an ``MC<n>_STATUS`` token (case-insensitive, on word boundaries) and
# captures the decimal bank number ``<n>`` in the named group ``bank``.
_MCE_BANK_RE = re.compile(r"\bMC(?P<bank>\d+)_STATUS\b", re.IGNORECASE)

# One ignore entry: an int, or a string that parse_ignore_mce_banks reads as
# either a single bank number or a "start-end" range.
IgnoreMceBankSpec = Union[int, str]


def parse_ignore_mce_banks(
    spec: Optional[Sequence[IgnoreMceBankSpec]],
) -> frozenset[int]:
    """Expand ignore_mce_banks config entries into a set of MCA bank numbers.

    Each entry is either a single bank number (``7`` or ``"7"``) or an
    inclusive range written as ``"start-end"`` (for example ``"60-63"``).
    Whitespace around a string entry and around its range bounds is ignored.

    Args:
        spec: Bank ids, bank ranges like ``"60-63"``, or ``None``.

    Returns:
        frozenset[int]: MCA bank numbers to ignore; empty when ``spec`` is
        ``None`` or has no entries.

    Raises:
        ValueError: If an entry is blank, is not numeric, is negative, or is
            a range whose start is greater than its end.
    """
    if not spec:
        return frozenset()

    banks: set[int] = set()
    for entry in spec:
        if isinstance(entry, int):
            if entry < 0:
                raise ValueError(f"Invalid MCE bank number: {entry}")
            banks.add(entry)
            continue

        token = str(entry).strip()
        if not token:
            raise ValueError("Empty MCE bank ignore entry")

        # Look for the range separator from index 1 so a leading sign stays
        # part of the number: "-5" is reported as a negative bank instead of
        # being split into an empty range start and failing inside int().
        dash = token.find("-", 1)
        kind = "number" if dash == -1 else "range"
        try:
            if dash == -1:
                start = end = int(token)
            else:
                start = int(token[:dash])
                end = int(token[dash + 1:])
        except ValueError as err:
            # Replace int()'s generic message with one naming the bad entry.
            raise ValueError(f"Invalid MCE bank {kind}: {entry}") from err

        if start < 0 or end < 0 or start > end:
            raise ValueError(f"Invalid MCE bank {kind}: {entry}")
        banks.update(range(start, end + 1))

    return frozenset(banks)


def extract_mce_bank_from_line(line: str) -> Optional[int]:
    """Pull the MCA bank number out of a dmesg line.

    Args:
        line: Single dmesg log line.

    Returns:
        Optional[int]: Bank number taken from the first MCn_STATUS token on
        the line, or None when the line has no such token.
    """
    found = _MCE_BANK_RE.search(line)
    return None if found is None else int(found.group("bank"))


def filter_ignored_mce_bank_lines(content: str, ignore_banks: frozenset[int]) -> str:
    """Remove every dmesg line that reports an MCA bank in ignore_banks.

    Args:
        content: Full dmesg text.
        ignore_banks: MCA bank numbers to ignore.

    Returns:
        str: The dmesg text without the ignored lines. Remaining lines are
        joined with "\\n", and a trailing newline is kept only when the input
        ended with one. Returns "" when every line was dropped.
    """
    if not ignore_banks:
        # Nothing to filter: hand the text back untouched.
        return content

    def _is_ignored(row: str) -> bool:
        bank = extract_mce_bank_from_line(row)
        return bank is not None and bank in ignore_banks

    survivors = [row for row in content.splitlines() if not _is_ignored(row)]
    if not survivors:
        return ""

    trailer = "\n" if content.endswith("\n") else ""
    return "\n".join(survivors) + trailer
22 changes: 19 additions & 3 deletions nodescraper/plugins/inband/dmesg/mce_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@
#
###############################################################################
import re
from typing import Optional
from typing import FrozenSet, Optional

from nodescraper.base.match_ignore import extract_mce_bank_from_line

_CORRECTABLE_SUMMARY_RE = re.compile(
r"(?P<count>\d+)\s+correctable hardware errors detected in total in (?P<block>\w+) block"
Expand Down Expand Up @@ -91,14 +93,18 @@ def _gpu_index_for_bdf(bdf: str, bdf_order: list[str]) -> int:
return bdf_order.index(bdf)


def parse_correctable_mce_counts(content: str) -> dict[str, int]:
def parse_correctable_mce_counts(
content: str,
ignore_banks: Optional[FrozenSet[int]] = None,
) -> dict[str, int]:
"""Count correctable MCE / RAS hardware errors per component from dmesg text.

Handles summary lines (for example ``mce: 3 correctable ... on CPU1``),
amdgpu block summaries, and per-event ``MCn_STATUS[|CE|]`` hardware error lines.
"""
counts: dict[str, int] = {}
gpu_bdf_order: list[str] = []
ignored = ignore_banks or frozenset()

for line in content.splitlines():
gpu_match = _GPU_CORRECTABLE_RE.search(line)
Expand All @@ -123,16 +129,23 @@ def parse_correctable_mce_counts(content: str) -> dict[str, int]:

status_match = _MCE_CE_STATUS_RE.search(line)
if status_match:
bank = extract_mce_bank_from_line(line)
if bank is not None and bank in ignored:
continue
part = status_match.group("cpu") if status_match.group("cpu") else "unknown"
_add_count(counts, part, 1)

return counts


def parse_uncorrectable_mce_counts(content: str) -> dict[str, int]:
def parse_uncorrectable_mce_counts(
content: str,
ignore_banks: Optional[FrozenSet[int]] = None,
) -> dict[str, int]:
"""Count uncorrectable MCE / RAS hardware errors per component from dmesg text."""
counts: dict[str, int] = {}
gpu_bdf_order: list[str] = []
ignored = ignore_banks or frozenset()

for line in content.splitlines():
gpu_match = _GPU_UNCORRECTABLE_RE.search(line)
Expand All @@ -154,6 +167,9 @@ def parse_uncorrectable_mce_counts(content: str) -> dict[str, int]:

status_match = _MCE_UC_STATUS_RE.search(line)
if status_match:
bank = extract_mce_bank_from_line(line)
if bank is not None and bank in ignored:
continue
part = status_match.group("cpu") if status_match.group("cpu") else "unknown"
_add_count(counts, part, 1)

Expand Down
Loading
Loading