Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 51 additions & 4 deletions bugbug/commit_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.
import sys
from pathlib import Path
from typing import Sequence

import pandas as pd
Expand Down Expand Up @@ -813,6 +814,39 @@ def __call__(self, commit, **kwargs):
return commit["files"]


class FilesPathComponents(object):
    """Feature extractor yielding the distinct path components of a commit.

    Every path in ``commit["files"]`` is split with ``Path(path).parts`` and
    the unique components (directory names and file names) are returned.
    """

    name = "filespathcomponents"

    def __call__(self, commit, **kwargs):
        """Return the unique path components of the commit's files.

        Args:
            commit: Mapping with a ``"files"`` entry holding an iterable of
                path strings.
            **kwargs: Accepted and ignored.

        Returns:
            A sorted list of distinct components; the root marker ``"/"``
            and empty strings are excluded.
        """
        components = {
            part
            for path in commit["files"]
            for part in Path(path).parts
            if part not in ("/", "")
        }
        # Set iteration order for strings depends on hash randomization, so
        # sort to make the output reproducible from one run to the next.
        return sorted(components)


class PlatformKeywords(object):
    """Feature extractor counting keyword hits for one platform.

    The instance ``name`` is derived from ``platform``, so extractors built
    for different platforms carry different names.
    """

    def __init__(self, platform, keywords):
        """Store the platform and a lower-cased copy of its keywords.

        Args:
            platform: Label used to build the extractor name
                (``"<platform> keywords"``).
            keywords: Iterable of keyword strings; matching is done on
                their lower-cased form.
        """
        self.platform = platform
        self.keywords = [keyword.lower() for keyword in keywords]
        self.name = f"{platform} keywords"

    def __call__(self, commit, **kwargs):
        """Count keyword hits in the commit's file paths and description.

        A file contributes 1 when at least one keyword is a substring of
        any of its lower-cased path components. Each keyword that occurs in
        the lower-cased ``commit["desc"]`` contributes 1 more.

        Args:
            commit: Mapping with ``"files"`` (iterable of path strings) and
                ``"desc"`` (string) entries.
            **kwargs: Accepted and ignored.

        Returns:
            The number of matching files plus the number of keywords found
            in the description.
        """
        matching_files = 0
        for file_path in commit["files"]:
            segments = [segment.lower() for segment in Path(file_path).parts]
            if any(
                keyword in segment
                for segment in segments
                for keyword in self.keywords
            ):
                # A file is counted once, however many keywords it matches.
                matching_files += 1

        description = commit["desc"].lower()
        mentioned = [
            keyword for keyword in self.keywords if keyword in description
        ]
        return matching_files + len(mentioned)


def _pass_through_tokenizer(doc):
return doc

Expand Down Expand Up @@ -863,6 +897,15 @@ def __call__(self, commit, **kwargs):
return commit["types"]


class TypesCounts(object):
    """Feature extractor tallying the commit's files by type."""

    def __call__(self, commit, **kwargs):
        """Count files per type as reported by ``repository.get_type``.

        Args:
            commit: Mapping with a ``"files"`` entry holding an iterable of
                paths.
            **kwargs: Accepted and ignored.

        Returns:
            A dict mapping ``"<type> count"`` to the number of files for
            which ``repository.get_type`` returned that type. Empty when
            the commit has no files.
        """
        tally: dict[str, int] = {}
        for file_path in commit["files"]:
            key = f"{repository.get_type(file_path)} count"
            if key in tally:
                tally[key] += 1
            else:
                tally[key] = 1
        return tally


def merge_metrics(objects):
metrics = {}

Expand Down Expand Up @@ -890,6 +933,7 @@ def merge_commits(commits: Sequence[repository.CommitDict]) -> repository.Commit
return repository.CommitDict(
{
"nodes": list(commit["node"] for commit in commits),
"desc": " ".join(commit["desc"] for commit in commits),
"pushdate": commits[0]["pushdate"],
"types": list(set(sum((commit["types"] for commit in commits), []))),
"files": list(set(sum((commit["files"] for commit in commits), []))),
Expand Down Expand Up @@ -965,9 +1009,12 @@ def merge_commits(commits: Sequence[repository.CommitDict]) -> repository.Commit

class CommitExtractor(BaseEstimator, TransformerMixin):
def __init__(self, feature_extractors, cleanup_functions):
assert len(set(type(fe) for fe in feature_extractors)) == len(
feature_extractors
), "Duplicate Feature Extractors"
assert len(
set(
fe.name if hasattr(fe, "name") else type(fe)
for fe in feature_extractors
)
) == len(feature_extractors), "Duplicate Feature Extractors"
self.feature_extractors = feature_extractors

assert len(set(type(cf) for cf in cleanup_functions)) == len(
Expand Down Expand Up @@ -1011,7 +1058,7 @@ def transform(self, commits):
# FIXME: This is a workaround to pass the value to the
# union transformer independently. This will be dropped when we
# resolve https://github.com/mozilla/bugbug/issues/3876
if isinstance(feature_extractor, Files):
if isinstance(feature_extractor, (Files, FilesPathComponents)):
result[sys.intern(feature_extractor_name)] = res
continue

Expand Down
8 changes: 8 additions & 0 deletions bugbug/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,8 @@ def get_human_readable_feature_names(self):
feature_name = f"Combined text contains '{feature_name}'"
elif type_ == "files":
feature_name = f"File '{feature_name}'"
elif type_ == "filespathcomponents":
feature_name = f"File path component '{feature_name}'"
elif type_ not in ("data", "couple_data"):
raise ValueError(f"Unexpected feature type for: {full_feature_name}")

Expand All @@ -226,6 +228,12 @@ def get_human_readable_feature_names(self):
return cleaned_feature_names

def get_important_features(self, cutoff, shap_values):
# In the multi-class case, we have (n_samples, n_features, n_classes) and
# we need to normalize it to (n_classes, n_samples, n_features) for the logic
# below to work.
if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
shap_values = np.moveaxis(shap_values, -1, 0)

# returns top features for a shap_value matrix
def get_top_features(cutoff, shap_values):
# Calculate the values that represent the fraction of the model output variability attributable
Expand Down
1 change: 1 addition & 0 deletions bugbug/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
"testlabelselect": "bugbug.models.testselect.TestLabelSelectModel",
"testgroupselect": "bugbug.models.testselect.TestGroupSelectModel",
"testconfiggroupselect": "bugbug.models.testselect.TestConfigGroupSelectModel",
"testconfig": "bugbug.models.testfailure.TestConfigModel",
"testfailure": "bugbug.models.testfailure.TestFailureModel",
"tracking": "bugbug.models.tracking.TrackingModel",
"uplift": "bugbug.models.uplift.UpliftModel",
Expand Down
Loading