class FilesPathComponents(object):
    """Extract the distinct path components of the files touched by a commit."""

    name = "filespathcomponents"

    def __call__(self, commit, **kwargs):
        """Return the unique path components found in ``commit["files"]``.

        Args:
            commit: Mapping with a "files" entry holding path strings.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            A sorted list of the unique components (directory and file
            names) of all paths; "/" and empty strings are excluded.
        """
        components = {
            part
            for path in commit["files"]
            for part in Path(path).parts
            if part not in ("/", "")
        }
        # Set iteration order for strings changes between interpreter runs
        # (hash randomization); sorting keeps the output reproducible.
        return sorted(components)


class PlatformKeywords(object):
    """Count how often platform-specific keywords show up in a commit.

    Args:
        platform: Platform label; used to build the extractor ``name``
            ("<platform> keywords").
        keywords: Iterable of keyword strings; they are lowercased once here
            so that matching is case-insensitive.
    """

    def __init__(self, platform, keywords):
        self.platform = platform
        self.keywords = [kw.lower() for kw in keywords]
        self.name = f"{platform} keywords"

    def _path_matches(self, path):
        """Return True if any keyword is a substring of any component of *path*."""
        # Lowercase each component once instead of once per keyword.
        lowered_parts = [part.lower() for part in Path(path).parts]
        return any(kw in part for part in lowered_parts for kw in self.keywords)

    def __call__(self, commit, **kwargs):
        """Return the keyword hit count for *commit*.

        The count is the number of files in ``commit["files"]`` with at least
        one matching path component, plus the number of keywords that occur
        in the lowercased ``commit["desc"]``.
        """
        count = sum(1 for path in commit["files"] if self._path_matches(path))
        desc_lower = commit["desc"].lower()
        count += sum(1 for kw in self.keywords if kw in desc_lower)
        return count


class TypesCounts(object):
    """Count the files touched by a commit, grouped by file type."""

    def __call__(self, commit, **kwargs):
        """Return a dict mapping "<type> count" to the number of such files.

        The type of each path in ``commit["files"]`` is whatever
        ``repository.get_type`` returns for it.
        """
        counts: dict[str, int] = {}
        for path in commit["files"]:
            key = f"{repository.get_type(path)} count"
            counts[key] = counts.get(key, 0) + 1
        return counts
list(set(sum((commit["files"] for commit in commits), []))), @@ -965,9 +1009,12 @@ def merge_commits(commits: Sequence[repository.CommitDict]) -> repository.Commit class CommitExtractor(BaseEstimator, TransformerMixin): def __init__(self, feature_extractors, cleanup_functions): - assert len(set(type(fe) for fe in feature_extractors)) == len( - feature_extractors - ), "Duplicate Feature Extractors" + assert len( + set( + fe.name if hasattr(fe, "name") else type(fe) + for fe in feature_extractors + ) + ) == len(feature_extractors), "Duplicate Feature Extractors" self.feature_extractors = feature_extractors assert len(set(type(cf) for cf in cleanup_functions)) == len( @@ -1011,7 +1058,7 @@ def transform(self, commits): # FIXME: This is a workaround to pass the value to the # union transformer independently. This will be dropped when we # resolve https://github.com/mozilla/bugbug/issues/3876 - if isinstance(feature_extractor, Files): + if isinstance(feature_extractor, (Files, FilesPathComponents)): result[sys.intern(feature_extractor_name)] = res continue diff --git a/bugbug/model.py b/bugbug/model.py index 58255f81ed..9616ec19ed 100644 --- a/bugbug/model.py +++ b/bugbug/model.py @@ -218,6 +218,8 @@ def get_human_readable_feature_names(self): feature_name = f"Combined text contains '{feature_name}'" elif type_ == "files": feature_name = f"File '{feature_name}'" + elif type_ == "filespathcomponents": + feature_name = f"File path component '{feature_name}'" elif type_ not in ("data", "couple_data"): raise ValueError(f"Unexpected feature type for: {full_feature_name}") @@ -226,6 +228,12 @@ def get_human_readable_feature_names(self): return cleaned_feature_names def get_important_features(self, cutoff, shap_values): + # In the multi-class case, we have (n_samples, n_features, n_classes) and + # we need to normalize it to (n_classes, n_samples, n_features) for the logic + # below to work. 
+ if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3: + shap_values = np.moveaxis(shap_values, -1, 0) + # returns top features for a shap_value matrix def get_top_features(cutoff, shap_values): # Calculate the values that represent the fraction of the model output variability attributable diff --git a/bugbug/models/__init__.py b/bugbug/models/__init__.py index eb1a66a84f..8b5468e78a 100644 --- a/bugbug/models/__init__.py +++ b/bugbug/models/__init__.py @@ -33,6 +33,7 @@ "testlabelselect": "bugbug.models.testselect.TestLabelSelectModel", "testgroupselect": "bugbug.models.testselect.TestGroupSelectModel", "testconfiggroupselect": "bugbug.models.testselect.TestConfigGroupSelectModel", + "testconfig": "bugbug.models.testfailure.TestConfigModel", "testfailure": "bugbug.models.testfailure.TestFailureModel", "tracking": "bugbug.models.tracking.TrackingModel", "uplift": "bugbug.models.uplift.UpliftModel", diff --git a/bugbug/models/testfailure.py b/bugbug/models/testfailure.py index 0f84a5121c..ea890de981 100644 --- a/bugbug/models/testfailure.py +++ b/bugbug/models/testfailure.py @@ -4,6 +4,7 @@ # You can obtain one at http://mozilla.org/MPL/2.0/. 
class _BalancedXGBClassifier(xgboost.XGBClassifier):
    """XGBClassifier that is always trained with class-balanced sample weights."""

    def fit(self, X, y, **fit_params):
        """Fit the classifier, weighting samples to balance the classes in *y*.

        Balanced weights are computed with
        ``compute_sample_weight("balanced", y)``. If the caller also passes
        ``sample_weight``, the two are multiplied element-wise instead of the
        caller's weights being silently discarded.
        """
        weights = compute_sample_weight("balanced", y)
        caller_weights = fit_params.get("sample_weight")
        if caller_weights is not None:
            weights = weights * caller_weights
        fit_params["sample_weight"] = weights
        return super().fit(X, y, **fit_params)


# (keywords, platform) pairs. Entries are checked in order and the first
# entry with a keyword contained in the configuration name wins.
PLATFORM_KEYWORDS = (
    (("linux",), "linux"),
    (("windows", "win"), "windows"),
    (("android", "apk", "fenix", "focus", "klar"), "android"),
    (("macosx",), "mac"),
)


def get_platform(task_name: str) -> Optional[str]:
    """Map a task name to a platform label.

    Only the part of *task_name* before the first "/" is inspected.

    Returns:
        The platform of the first ``PLATFORM_KEYWORDS`` entry having a
        keyword that is a substring of that part, or None if nothing matches.
    """
    config = task_name.split("/")[0]
    for keywords, platform in PLATFORM_KEYWORDS:
        if any(keyword in config for keyword in keywords):
            return platform
    return None
class TestConfigModel(CommitModel):
    """Commit model whose classes are platforms with test regressions.

    ``get_labels`` assigns each push either the single platform on which
    regressions were recorded, or "any" when more than one platform is
    involved.
    """

    def train_test_split(self, X, y):
        """Split X and y with a fixed seed, holding out 10% as the test set."""
        from sklearn.model_selection import train_test_split as split

        return split(X, y, test_size=0.1, random_state=0)

    def __init__(self, lemmatization=False):
        """Register the training DBs and build the extraction and classifier pipelines."""
        CommitModel.__init__(self, lemmatization)

        for training_db in (
            test_scheduling.TEST_LABEL_SCHEDULING_DB,
            test_scheduling.PUSH_DATA_CONFIG_GROUP_DB,
        ):
            self.training_dbs.append(training_db)

        # Platform name -> keywords counted by the PlatformKeywords extractors.
        platform_keywords = (
            ("mac", ["cocoa", "mac"]),
            ("windows", ["wmf", "winlauncher", "dxgi", "hresult", "playready"]),
            ("linux", ["vulkan", "wayland"]),
            ("android", ["geckoview", "fenix"]),
        )

        feature_extractors = [
            commit_features.SourceCodeFileSize(),
            commit_features.OtherFileSize(),
            commit_features.TestFileSize(),
            commit_features.SourceCodeAdded(),
            commit_features.OtherAdded(),
            commit_features.TestAdded(),
            commit_features.SourceCodeDeleted(),
            commit_features.OtherDeleted(),
            commit_features.TestDeleted(),
            commit_features.Types(),
            commit_features.TypesCounts(),
            commit_features.FilesPathComponents(),
            commit_features.Components(),
            commit_features.ComponentsModifiedNum(),
            commit_features.DirectoriesModifiedNum(),
            commit_features.SourceCodeFilesModifiedNum(),
            commit_features.OtherFilesModifiedNum(),
            commit_features.TestFilesModifiedNum(),
            commit_features.SourceCodeFileMetrics(),
        ]
        feature_extractors.extend(
            commit_features.PlatformKeywords(platform, keywords)
            for platform, keywords in platform_keywords
        )

        cleanup_functions = [
            feature_cleanup.fileref(),
            feature_cleanup.url(),
            feature_cleanup.synonyms(),
        ]

        commit_extractor = commit_features.CommitExtractor(
            feature_extractors, cleanup_functions
        )
        self.extraction_pipeline = Pipeline([("commit_extractor", commit_extractor)])

        path_components_vectorizer = CountVectorizer(
            analyzer=utils.keep_as_is,
            lowercase=False,
            min_df=0.01,
        )
        description_vectorizer = self.text_vectorizer(
            min_df=0.01, stop_words="english"
        )
        union = ColumnTransformer(
            [
                ("data", DictVectorizer(), "data"),
                (
                    "filespathcomponents",
                    path_components_vectorizer,
                    "filespathcomponents",
                ),
                ("desc", description_vectorizer, "desc"),
            ]
        )
        estimator = _BalancedXGBClassifier(
            learning_rate=0.05,
            n_estimators=200,
            subsample=1.0,
            colsample_bytree=0.7,
            reg_alpha=0.5,
            max_depth=4,
            n_jobs=utils.get_physical_cpu_count(),
        )
        self.clf = ImblearnPipeline([("union", union), ("estimator", estimator)])

    def _get_label_db_data(self):
        """Collect regressing platforms per push from the "label" scheduling history.

        Returns:
            A pair ``(revs, platforms)``: the set of first revisions of every
            history entry, and a dict mapping a first revision to the set of
            platforms with regressions (only entries with a non-empty set).
        """
        seen_revs = set()
        platforms_by_rev = {}
        for revs, test_datas in test_scheduling.get_test_scheduling_history("label"):
            head = revs[0]
            seen_revs.add(head)

            # Use likely regressions when the push has any; otherwise fall
            # back to possible regressions.
            if any(td["is_likely_regression"] for td in test_datas):
                regression_key = "is_likely_regression"
            else:
                regression_key = "is_possible_regression"

            # Tasks whose name contains "-test-verify" are ignored.
            detected = {
                get_platform(td["name"])
                for td in test_datas
                if td[regression_key] and "-test-verify" not in td["name"]
            }
            detected.discard(None)
            if detected:
                platforms_by_rev[head] = detected
        return seen_revs, platforms_by_rev

    def _get_cg_db_data(self):
        """Collect regressing platforms per push from the config/group push data DB.

        Returns:
            A pair ``(revs, platforms)`` with the same layout as the one
            returned by ``_get_label_db_data``.
        """
        seen_revs = set()
        platforms_by_rev = {}
        for revisions, _, _, possible_regressions, likely_regressions in db.read(
            test_scheduling.PUSH_DATA_CONFIG_GROUP_DB
        ):
            head = revisions[0]
            seen_revs.add(head)

            # Likely regressions take precedence when there are any.
            regressions = likely_regressions or possible_regressions
            detected = {get_platform(config) for config, _group in regressions}
            detected.discard(None)
            if detected:
                platforms_by_rev[head] = detected
        return seen_revs, platforms_by_rev

    def _get_rev_to_revisions(self, classes):
        """Map each labelled first revision to the full revision list of its push.

        Entries from the "label" scheduling history take precedence; the
        config/group DB only fills in revisions that are still missing.
        """
        mapping = {}
        for revs, _ in test_scheduling.get_test_scheduling_history("label"):
            head = revs[0]
            if head in classes:
                mapping[head] = revs
        for revisions, _, _, _, _ in db.read(test_scheduling.PUSH_DATA_CONFIG_GROUP_DB):
            head = revisions[0]
            if head in classes:
                mapping.setdefault(head, revisions)
        return mapping

    def get_labels(self):
        """Build the labels from pushes present in both data sources.

        Returns:
            A pair ``(classes, class_names)``: a dict mapping a push's first
            revision to its label, and the sorted list of distinct labels.
            Pushes without any detected platform are left out.
        """
        label_revs, label_platforms = self._get_label_db_data()
        cg_revs, cg_platforms = self._get_cg_db_data()

        no_platforms = set()
        classes = {}
        for rev in label_revs & cg_revs:
            merged = label_platforms.get(rev, no_platforms) | cg_platforms.get(
                rev, no_platforms
            )
            if not merged:
                continue
            if len(merged) == 1:
                (only_platform,) = merged
                classes[rev] = only_platform
            else:
                classes[rev] = "any"

        class_names = sorted(set(classes.values()))

        logger.info("%d pushes considered", len(classes))
        assigned = list(classes.values())
        for label in class_names:
            logger.info(
                "%d pushes with label '%s'",
                assigned.count(label),
                label,
            )

        return classes, class_names

    def items_gen(self, classes):
        """Yield ``(merged commit data, label)`` for every labelled push.

        Pushes for which none of the revisions is found among the repository
        commits are skipped.
        """
        rev_to_revisions = self._get_rev_to_revisions(classes)

        needed_revisions = set().union(*rev_to_revisions.values())

        commit_map = {
            commit["node"]: commit
            for commit in repository.get_commits()
            if commit["node"] in needed_revisions
        }

        assert len(commit_map) > 0

        for rev, revisions in rev_to_revisions.items():
            commits = tuple(commit_map[r] for r in revisions if r in commit_map)
            if commits:
                yield commit_features.merge_commits(commits), classes[rev]

    def get_feature_names(self):
        """Return the output feature names of the "union" step of the classifier."""
        union = self.clf.named_steps["union"]
        return union.get_feature_names_out()
"1 year" } + provisionerId: proj-bugbug + workerType: compute-large + dependencies: + - commit-retrieval + - test-label-scheduling-history-generator + - test-config_group-scheduling-history-push_data-retrieval + payload: + maxRunTime: 25200 + image: mozilla/bugbug-base:${version} + command: + - bugbug-train + - testconfig + + artifacts: + public/testconfigmodel.tar.zst: + path: /testconfigmodel.tar.zst + type: file + public/metrics.json: + path: /metrics.json + type: file + + routes: + - notify.email.bugbug-team@mozilla.com.on-failed + - index.project.bugbug.train_testconfig.${version} + - index.project.bugbug.train_testconfig.per_version.${version}.${year}.${month}.${day}.${hour}.${minute}.${second} + - index.project.bugbug.train_testconfig.per_date.${year}.${month}.${day}.${hour}.${minute}.${second}.${version} + - index.project.bugbug.train_testconfig.latest + metadata: + name: bugbug train testconfig model + description: bugbug train testconfig model + owner: bugbug-team@mozilla.com + source: ${repository}/raw/master/data-pipeline.yml + - ID: train-needsdiagnosis created: { $fromNow: "" } deadline: { $fromNow: "3 days" } @@ -1454,6 +1491,7 @@ tasks: - train-test-label-select - train-test-group-select - train-test-failure + - train-test-config - train-needsdiagnosis - train-accessibility - train-performancebug @@ -1494,6 +1532,7 @@ tasks: - train-spambug - train-test-label-select - train-test-group-select + - train-test-config - train-needsdiagnosis - train-accessibility - train-performancebug