From a1c1a0ef0773d1921058de760d00b65b6b395472 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Tue, 2 Jun 2026 16:47:42 +0530 Subject: [PATCH 01/10] Move --ignore functionality to codebase creation Deprecate --ignore and --include pre-scan plugins and move the ignore/include functionality to codebase import stage to get rid of multiple codebase walks. Signed-off-by: Ayan Sinha Mahapatra --- pyproject-scancode-toolkit-mini.toml | 1 - pyproject-scancode-toolkit.toml | 1 - pyproject.toml | 1 - src/commoncode/resource.py | 85 ++++++++++++--- src/scancode/cli.py | 34 +++++- src/scancode/plugin_ignore.py | 138 ++++++++++-------------- tests/commoncode/test_fileset.py | 5 + tests/scancode/data/help/help_linux.txt | 4 +- tests/scancode/test_plugin_ignore.py | 29 +---- 9 files changed, 170 insertions(+), 128 deletions(-) diff --git a/pyproject-scancode-toolkit-mini.toml b/pyproject-scancode-toolkit-mini.toml index 06ad0bfdd3f..f0a5696872e 100644 --- a/pyproject-scancode-toolkit-mini.toml +++ b/pyproject-scancode-toolkit-mini.toml @@ -256,7 +256,6 @@ scancode-train-gibberish-model = "textcode.train_gibberish_model:train_gibberish # scancode_pre_scan is the entry point for pre_scan plugins executed before the # scans. See also plugincode.pre_scan module for details and doc. [project.entry-points.scancode_pre_scan] -ignore = "scancode.plugin_ignore:ProcessIgnore" facet = "summarycode.facet:AddFacet" diff --git a/pyproject-scancode-toolkit.toml b/pyproject-scancode-toolkit.toml index ade84a60c2f..6ee2048f76d 100644 --- a/pyproject-scancode-toolkit.toml +++ b/pyproject-scancode-toolkit.toml @@ -257,7 +257,6 @@ scancode-train-gibberish-model = "textcode.train_gibberish_model:train_gibberish # scancode_pre_scan is the entry point for pre_scan plugins executed before the # scans. See also plugincode.pre_scan module for details and doc. 
[project.entry-points.scancode_pre_scan] -ignore = "scancode.plugin_ignore:ProcessIgnore" facet = "summarycode.facet:AddFacet" diff --git a/pyproject.toml b/pyproject.toml index 8c90c9078db..a74f58405a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -262,7 +262,6 @@ scancode-train-gibberish-model = "textcode.train_gibberish_model:train_gibberish # scancode_pre_scan is the entry point for pre_scan plugins executed before the # scans. See also plugincode.pre_scan module for details and doc. [project.entry-points.scancode_pre_scan] -ignore = "scancode.plugin_ignore:ProcessIgnore" facet = "summarycode.facet:AddFacet" diff --git a/src/commoncode/resource.py b/src/commoncode/resource.py index d19e2da842d..19bfa955e36 100644 --- a/src/commoncode/resource.py +++ b/src/commoncode/resource.py @@ -39,6 +39,7 @@ from commoncode.datautils import List from commoncode.datautils import Mapping from commoncode.datautils import String +from commoncode.fileset import is_included from commoncode.filetype import is_file as filetype_is_file from commoncode.filetype import is_special from commoncode.fileutils import as_posixpath @@ -62,7 +63,7 @@ # Tracing flags TRACE = False -TRACE_DEEP = False +TRACE_DEEP = True def logger_debug(*args): @@ -98,7 +99,7 @@ def skip_ignored(location): if TRACE_DEEP: logger_debug() logger_debug( - "Codebase.populate: walk: ignored loc:", + "Codebase.populate: walk: skip_ignored:", location, "ignored:", ignored(location), @@ -109,6 +110,42 @@ def skip_ignored(location): return is_special(location) or ignored(location) +def is_ignored(location, includes=None, excludes=None): + + excludes = { + pattern: 'User ignore: Supplied by --ignore' for pattern in excludes + } + + includes = { + pattern: 'User include: Supplied by --include' for pattern in includes + } + + included_from_options = is_included( + path=location, + includes=includes, + excludes=excludes, + ) + + if TRACE_DEEP: + logger_debug( + "Codebase.populate: walk: is_ignored:", + "is_ignored: 
location:", + location, + "included_from_options:", + included_from_options, + "skip_ignored", + skip_ignored(location) + ) + + if skip_ignored(location) or not included_from_options: + if TRACE_DEEP: + logger_debug("is_ignored: location:", location, "is_skipped",) + + return True + + return False + + def depth_walk( root_location, max_depth, @@ -202,6 +239,8 @@ class Codebase: __slots__ = ( "max_depth", "location", + "includes", + "ignores", "has_single_resource", "resource_attributes", "resource_class", @@ -236,6 +275,8 @@ def __init__( max_in_memory=10000, max_depth=0, paths=tuple(), + ignores=tuple(), + includes=tuple(), *args, **kwargs, ): @@ -298,6 +339,8 @@ def __init__( # finally populate self.paths = self._prepare_clean_paths(paths) + self.ignores = ignores + self.includes = includes self._populate() def _prepare_clean_paths(self, paths=tuple()): @@ -461,11 +504,17 @@ def _populate(self): return if self.paths: - return self._create_resources_from_paths(root=root, paths=self.paths) + # In case of a list of full paths, we create resources without walking + return self._create_resources_from_full_paths(root=root, paths=self.paths) + # In case we have multiple else: - return self._create_resources_from_root(root=root) + return self._create_resources_from_root( + root=root, + includes=self.includes, + ignores=self.ignores, + ) - def _create_resources_from_paths(self, root, paths): + def _create_resources_from_full_paths(self, root, paths): # without paths we iterate the provided paths. We report an error # if a path is missing on disk. @@ -483,22 +532,21 @@ def _create_resources_from_paths(self, root, paths): msg = f"ERROR: cannot populate codebase: path: {path!r} not found in {res_loc!r}" self.errors.append(msg) raise Exception(path, join(base_location, path)) - continue # create all parents. 
The last parent is the one we want to use parent = root if TRACE: - logger_debug("Codebase._create_resources_from_paths: parent", parent) + logger_debug("Codebase._create_resources_from_full_paths: parent", parent) for parent_path in get_ancestor_paths(path, include_self=False): if TRACE: logger_debug( - f" Codebase._create_resources_from_paths: parent_path: {parent_path!r}" + f" Codebase._create_resources_from_full_paths: parent_path: {parent_path!r}" ) if not parent_path: continue newpar = parents_by_path.get(parent_path) if TRACE: - logger_debug(" Codebase._create_resources_from_paths: newpar", repr(newpar)) + logger_debug(" Codebase._create_resources_from_full_paths: newpar", repr(newpar)) if not newpar: newpar = self._get_or_create_resource( @@ -509,7 +557,7 @@ def _create_resources_from_paths(self, root, paths): ) if not newpar: raise Exception( - "ERROR: Codebase._create_resources_from_paths:" + "ERROR: Codebase._create_resources_from_full_paths:" f" cannot create parent for: {parent_path!r}" ) parent = newpar @@ -518,7 +566,7 @@ def _create_resources_from_paths(self, root, paths): if TRACE: logger_debug( - f" Codebase._create_resources_from_paths:", + f" Codebase._create_resources_from_full_paths:", f"created newpar: {newpar!r}", ) @@ -529,10 +577,10 @@ def _create_resources_from_paths(self, root, paths): is_file=isfile(res_loc), ) if TRACE: - logger_debug("Codebase._create_resources_from_paths: resource", res) + logger_debug("Codebase._create_resources_from_full_paths: resource", res) - def _create_resources_from_root(self, root): - # without paths we walks the root location top-down + def _create_resources_from_root(self, root, includes, ignores): + # without paths we walk the root location top-down # track resources parents by location during construction. 
# NOTE: this cannot exhaust memory on a large codebase, because we do @@ -545,9 +593,15 @@ def err(_error): f"ERROR: cannot populate codebase: {_error}\n{traceback.format_exc()}" ) + skip_ignored = partial(is_ignored, includes=includes, excludes=ignores) + + if TRACE_DEEP: + logger_debug(f"parents_by_loc: {parents_by_loc}, ignores: {ignores}, includes: {includes}") + # Walk over the directory and build the resource tree for top, dirs, files in depth_walk( root_location=root.location, + skip_ignored=skip_ignored, max_depth=self.max_depth, error_handler=err, ): @@ -557,6 +611,7 @@ def err(_error): top=top, dirs=dirs, files=files, + skip_ignored=skip_ignored, ): # on the plain, bare FS, files cannot be parents if not created.is_file: @@ -574,6 +629,8 @@ def _create_resources(self, parent, top, dirs, files, skip_ignored=skip_ignored) for name in names: location = join(top, name) if skip_ignored(location): + if TRACE_DEEP: + logger_debug(f"_create_resources, depth_walk loop: ignored location: {location}") continue res = self._get_or_create_resource( name=name, diff --git a/src/scancode/cli.py b/src/scancode/cli.py index 1376c6cfee9..ca4d279eb0e 100644 --- a/src/scancode/cli.py +++ b/src/scancode/cli.py @@ -221,6 +221,26 @@ def default_processes(): callback=validate_input_path, type=click.Path(exists=True, readable=True, path_type=str)) +@click.option('--include', + multiple=True, + default=None, + metavar='', + help='Include files matching .', + sort_order=11, + help_group=cliutils.CORE_GROUP, + cls=PluggableCommandLineOption, +) + +@click.option('--ignore', + multiple=True, + default=None, + metavar='', + help='Ignore files matching .', + sort_order=10, + help_group=cliutils.CORE_GROUP, + cls=PluggableCommandLineOption, +) + @click.option('--strip-root', is_flag=True, default=False, @@ -395,6 +415,8 @@ def default_processes(): def scancode( ctx, input, # NOQA + include, + ignore, strip_root, full_root, processes, @@ -505,6 +527,8 @@ def scancode( # run proper success, 
_results = run_scan( input=input, + include=include, + ignore=ignore, from_json=from_json, strip_root=strip_root, full_root=full_root, @@ -545,7 +569,9 @@ def scancode( def run_scan( - input, # NOQA + input, # + include=[], + ignore=[], from_json=False, strip_root=False, full_root=False, @@ -644,12 +670,10 @@ def echo_func(*_args, **_kwargs): # and we craft a list of synthetic --include path pattern options from # the input list of paths included_paths = [as_posixpath(path).rstrip('/') for path in input] - # FIXME: this is a hack as this "include" is from an external plugin!!! - include = list(requested_options.get('include', []) or []) include.extend(included_paths) - requested_options['include'] = include # ... and use the common prefix as our new input + # FIXME: we should not walk outside inputs input = common_prefix # NOQA # build mappings of all options to pass down to plugins @@ -894,6 +918,8 @@ def echo_func(*_args, **_kwargs): try: codebase = codebase_class( location=input, + includes=include, + ignores=ignore, resource_attributes=resource_attributes, codebase_attributes=codebase_attributes, full_root=full_root, diff --git a/src/scancode/plugin_ignore.py b/src/scancode/plugin_ignore.py index 70b0e30b10b..3b1b3a06ed0 100644 --- a/src/scancode/plugin_ignore.py +++ b/src/scancode/plugin_ignore.py @@ -37,87 +37,63 @@ def logger_debug(*args): return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args)) -@pre_scan_impl -class ProcessIgnore(PreScanPlugin): +def process_codebase(codebase, ignore=(), include=(), **kwargs): """ - Include or ignore files matching patterns. + WARNING: DEPRECATED, ignore/include moved to codebase import + step in core plugins. + Keep only included and non-ignored Resources in the codebase. 
""" - options = [ - PluggableCommandLineOption(('--ignore',), - multiple=True, - default=None, - metavar='', - help='Ignore files matching .', - sort_order=10, - help_group=PRE_SCAN_GROUP), - PluggableCommandLineOption(('--include',), - multiple=True, - default=None, - metavar='', - help='Include files matching .', - sort_order=11, - help_group=PRE_SCAN_GROUP) - ] - - def is_enabled(self, ignore, include, **kwargs): - return ignore or include - - def process_codebase(self, codebase, ignore=(), include=(), **kwargs): - """ - Keep only included and non-ignored Resources in the codebase. - """ - - if not (ignore or include): - return - - excludes = { - pattern: 'User ignore: Supplied by --ignore' for pattern in ignore - } - - includes = { - pattern: 'User include: Supplied by --include' for pattern in include - } - - included = partial(is_included, includes=includes, excludes=excludes) - - paths_to_remove = set() - paths_to_remove_add = paths_to_remove.add - paths_to_remove_discard = paths_to_remove.discard - - # Walk codebase top-down to collect the paths of Resources to remove. - for resource in codebase.walk(topdown=True): - if resource.is_root: - continue - - resource_path = resource.path - - if not included(resource_path): - for child in resource.children(codebase): - paths_to_remove_add(child.path) - paths_to_remove_add(resource_path) - else: - # we may have been selected for removal based on a parent dir - # but may be explicitly included. Honor that - paths_to_remove_discard(resource_path) - - if TRACE: - logger_debug('process_codebase: paths_to_remove') - logger_debug(paths_to_remove) - for path in sorted(paths_to_remove): - logger_debug(codebase.get_resource(path)) - - remove_resource = codebase.remove_resource - - # Then, walk bottom-up and remove the non-included Resources from the - # Codebase if the Resource path is in our list of paths to remove. 
- for resource in codebase.walk(topdown=False): - resource_path = resource.path - if resource.is_root: - continue - # removing dirs will also remove its files - if resource.is_dir: - continue - if resource_path in paths_to_remove: - paths_to_remove_discard(resource_path) - remove_resource(resource) + if not (ignore or include): + return + + excludes = { + pattern: 'User ignore: Supplied by --ignore' for pattern in ignore + } + + includes = { + pattern: 'User include: Supplied by --include' for pattern in include + } + + included = partial(is_included, includes=includes, excludes=excludes) + + paths_to_remove = set() + paths_to_remove_add = paths_to_remove.add + paths_to_remove_discard = paths_to_remove.discard + + # Walk codebase top-down to collect the paths of Resources to remove. + for resource in codebase.walk(topdown=True): + if resource.is_root: + continue + + resource_path = resource.path + + if not included(resource_path): + for child in resource.children(codebase): + paths_to_remove_add(child.path) + paths_to_remove_add(resource_path) + else: + # we may have been selected for removal based on a parent dir + # but may be explicitly included. Honor that + paths_to_remove_discard(resource_path) + + if TRACE: + logger_debug('process_codebase: paths_to_remove') + logger_debug(paths_to_remove) + for path in sorted(paths_to_remove): + logger_debug(codebase.get_resource(path)) + + remove_resource = codebase.remove_resource + + # Then, walk bottom-up and remove the non-included Resources from the + # Codebase if the Resource path is in our list of paths to remove. 
+ for resource in codebase.walk(topdown=False): + resource_path = resource.path + if resource.is_root: + continue + # removing dirs will also remove its files + if resource.is_dir: + continue + if resource_path in paths_to_remove: + paths_to_remove_discard(resource_path) + remove_resource(resource) diff --git a/tests/commoncode/test_fileset.py b/tests/commoncode/test_fileset.py index 2c6e5ef72aa..4d5f639b99d 100644 --- a/tests/commoncode/test_fileset.py +++ b/tests/commoncode/test_fileset.py @@ -55,6 +55,11 @@ def test_is_included_is_included_exclusions_2(self): assert fileset.is_included("/some/src/this/that", incs, excs) assert not fileset.is_included("/src/dist/build/mylib.so", incs, excs) + def test_is_included_is_included_inside_exclusions(self): + incs = {"/src/*.so": ".scanignore"} + excs = {"/src/*": ".scanignore"} + assert not fileset.is_included("/src/dist/build/mylib.so", incs, excs) + def test_is_included_empty_exclusions(self): incs = {"/src/*": ".scanignore"} excs = {"": ".scanignore"} diff --git a/tests/scancode/data/help/help_linux.txt b/tests/scancode/data/help/help_linux.txt index 6794b19d602..855b7c7959d 100644 --- a/tests/scancode/data/help/help_linux.txt +++ b/tests/scancode/data/help/help_linux.txt @@ -94,8 +94,6 @@ Options: such that all paths have a common root directory. pre-scan: - --ignore Ignore files matching . - --include Include files matching . --facet = Add the to files with a path matching . @@ -140,11 +138,13 @@ Options: which are todo items and needs manual review. core: + --ignore Ignore files matching . --timeout Stop an unfinished file scan after a timeout in seconds. [default: 120 seconds] -n, --processes INT Set the number of parallel processes to use. Disable parallel processing if 0. Also disable threading if -1. [default: (number of CPUs)-1] + --include Include files matching . -q, --quiet Do not print summary or progress. -v, --verbose Print progress as file-by-file path instead of a progress bar. 
Print verbose scan counters. diff --git a/tests/scancode/test_plugin_ignore.py b/tests/scancode/test_plugin_ignore.py index 78f2954d76b..db739db88a7 100644 --- a/tests/scancode/test_plugin_ignore.py +++ b/tests/scancode/test_plugin_ignore.py @@ -14,7 +14,6 @@ from commoncode.fileset import is_included from scancode.cli_test_utils import run_scan_click from scancode.cli_test_utils import load_json_result -from scancode.plugin_ignore import ProcessIgnore from commoncode.resource import Codebase @@ -48,15 +47,13 @@ def test_is_included_glob_file(self): assert not is_included(location, excludes=excludes) def check_ProcessIgnore(self, test_dir, expected, ignore, include=()): - codebase = Codebase(test_dir) - test_plugin = ProcessIgnore() - test_plugin.process_codebase(codebase, ignore=ignore, include=include) + codebase = Codebase(location=test_dir, ignores=ignore, includes=include) resources = [res.strip_root_path for res in codebase.walk(skip_root=True)] assert sorted(resources) == expected def test_ProcessIgnore_with_single_file(self): test_dir = self.extract_test_tar('plugin_ignore/user.tgz') - ignore = ('sample.doc',) + ignore = ('*sample.doc',) expected = [ 'user', 'user/ignore.doc', @@ -69,7 +66,7 @@ def test_ProcessIgnore_with_single_file(self): def test_ProcessIgnore_with_multiple_files(self): test_dir = self.extract_test_tar('plugin_ignore/user.tgz') - ignore = ('ignore.doc', 'sample.doc',) + ignore = ('*ignore.doc', '*sample.doc',) expected = [ 'user', 'user/src', @@ -111,25 +108,10 @@ def test_ProcessIgnore_with_multiple_ignores(self): ] self.check_ProcessIgnore(test_dir, expected, ignore) - def test_ProcessIgnore_include_with_glob_for_extension(self): - test_dir = self.extract_test_tar('plugin_ignore/user.tgz') - include = ('*.doc',) - expected = [ - 'user', - 'user/ignore.doc', - 'user/src', - 'user/src/ignore.doc', - 'user/src/test', - 'user/src/test/sample.doc', - ] - self.check_ProcessIgnore(test_dir, expected, ignore=(), include=include) - def 
test_ProcessIgnore_process_codebase_does_not_fail_to_access_an_ignored_resourced_cached_to_disk(self): test_dir = self.extract_test_tar('plugin_ignore/user.tgz') - codebase = Codebase(test_dir, max_in_memory=1) - test_plugin = ProcessIgnore() ignore = ['test'] - test_plugin.process_codebase(codebase, ignore=ignore) + Codebase(location=test_dir, max_in_memory=1, ignores=ignore) class TestScanPluginIgnoreFiles(FileDrivenTesting): @@ -241,7 +223,7 @@ def test_scancode_multiple_ignores(self): def test_scancode_codebase_attempt_to_access_an_ignored_resourced_cached_to_disk(self): test_dir = self.extract_test_tar('plugin_ignore/user.tgz') result_file = self.get_temp_file('json') - args = ['--copyright', '--strip-root', '--ignore', 'test', test_dir, '--max-in-memory', '1', '--json', result_file] + args = ['--copyright', '--strip-root', '--ignore', '*test', test_dir, '--max-in-memory', '1', '--json', result_file] run_scan_click(args) scan_result = load_json_result(result_file) assert scan_result['headers'][0]['extra_data']['files_count'] == 2 @@ -251,6 +233,5 @@ def test_scancode_codebase_attempt_to_access_an_ignored_resourced_cached_to_disk u'user/ignore.doc', u'user/src', u'user/src/ignore.doc', - u'user/src/test', ] assert scan_locs == expected From 8e88b9426129d26fac2ec93106e502cb60a89f89 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Tue, 2 Jun 2026 18:50:21 +0530 Subject: [PATCH 02/10] Do not codebase walk outside input paths For multiple inputs, do not walk outside the input paths, from their common prefix. Instead create only the directory relationships between the common prefix and input paths and start the codebase walk from every input path. Also deprecate the --include option to only support ignoring paths through path patterns. 
Signed-off-by: Ayan Sinha Mahapatra --- src/commoncode/resource.py | 82 ++++++++++++++++++------- src/commoncode/testcase.py | 7 ++- src/scancode/cli.py | 41 +++++-------- tests/commoncode/test_resource.py | 2 +- tests/scancode/data/help/help_linux.txt | 1 - tests/scancode/test_cli.py | 13 +++- 6 files changed, 95 insertions(+), 51 deletions(-) diff --git a/src/commoncode/resource.py b/src/commoncode/resource.py index 19bfa955e36..16e17d4ca90 100644 --- a/src/commoncode/resource.py +++ b/src/commoncode/resource.py @@ -63,7 +63,7 @@ # Tracing flags TRACE = False -TRACE_DEEP = True +TRACE_DEEP = False def logger_debug(*args): @@ -110,7 +110,7 @@ def skip_ignored(location): return is_special(location) or ignored(location) -def is_ignored(location, includes=None, excludes=None): +def is_ignored(location, includes=tuple(), excludes=tuple()): excludes = { pattern: 'User ignore: Supplied by --ignore' for pattern in excludes @@ -339,8 +339,8 @@ def __init__( # finally populate self.paths = self._prepare_clean_paths(paths) + self.includes = self._prepare_clean_paths(includes) self.ignores = ignores - self.includes = includes self._populate() def _prepare_clean_paths(self, paths=tuple()): @@ -593,30 +593,48 @@ def err(_error): f"ERROR: cannot populate codebase: {_error}\n{traceback.format_exc()}" ) - skip_ignored = partial(is_ignored, includes=includes, excludes=ignores) + # ignore creating resources based on path patterns + skip_ignored = partial(is_ignored, excludes=ignores) if TRACE_DEEP: logger_debug(f"parents_by_loc: {parents_by_loc}, ignores: {ignores}, includes: {includes}") - # Walk over the directory and build the resource tree - for top, dirs, files in depth_walk( - root_location=root.location, - skip_ignored=skip_ignored, - max_depth=self.max_depth, - error_handler=err, - ): - parent = parents_by_loc.pop(top) - for created in self._create_resources( - parent=parent, - top=top, - dirs=dirs, - files=files, - skip_ignored=skip_ignored, + # in the case of a single 
input location, walking starts from + # the root and only the root location + if not includes: + includes = [root.location] + else: + # create the directory resources between the common + # prefix and the included locations so that they are + # connected to the root + for created in self._create_resources_common_prefix_to_inputs( + root=root, + includes=includes, ): - # on the plain, bare FS, files cannot be parents if not created.is_file: parents_by_loc[created.location] = created + # we start walking through all the input locations + for included_location in includes: + # Walk over the directory and build the resource tree + for top, dirs, files in depth_walk( + root_location=included_location, + skip_ignored=skip_ignored, + max_depth=self.max_depth, + error_handler=err, + ): + parent = parents_by_loc.pop(top) + for created in self._create_resources( + parent=parent, + top=top, + dirs=dirs, + files=files, + skip_ignored=skip_ignored, + ): + # on the plain, bare FS, files cannot be parents + if not created.is_file: + parents_by_loc[created.location] = created + def _create_resources(self, parent, top, dirs, files, skip_ignored=skip_ignored): """ Create and yield ``files`` and ``dirs`` children Resources of a @@ -641,6 +659,28 @@ def _create_resources(self, parent, top, dirs, files, skip_ignored=skip_ignored) logger_debug("Codebase.create_resources:", res) yield res + def _create_resources_common_prefix_to_inputs(self, root, includes): + + if TRACE_DEEP: + logger_debug(f"_create_resources_common_prefix_to_inputs: root:{root.location}, includes: {includes}") + + for included_path in includes: + _, _, extra_dir_path = included_path.rpartition(root.location) + extra_dirs = extra_dir_path.strip("/").split("/") + if TRACE_DEEP: + logger_debug(f"_create_resources_common_prefix_to_inputs: root:{root.location}, includes: {includes}") + + dir_resource = root + for dir_segment in extra_dirs: + dir_resource = self._get_or_create_resource( + name=dir_segment, + 
parent=dir_resource, + is_file=False, + ) + if TRACE: + logger_debug("Codebase.create_resources:", dir_resource) + yield dir_resource + def _create_root_resource(self): """ Create and return the root Resource of this codebase. @@ -1606,8 +1646,8 @@ def clean_path(path): Return a cleaned and normalized POSIX ``path``. """ path = path or "" - # convert to posix and ensure we have no slash at both ends - path = posixpath_normpath(path.replace("\\", "/").strip("/")) + # convert to posix and ensure we have no slash at the end + path = posixpath_normpath(path.replace("\\", "/").rstrip("/")) if path == ".": path = "" return path diff --git a/src/commoncode/testcase.py b/src/commoncode/testcase.py index 2a7b37a9923..3857e2e5f46 100644 --- a/src/commoncode/testcase.py +++ b/src/commoncode/testcase.py @@ -92,7 +92,7 @@ class FileDrivenTesting(object): test_data_dir = None - def get_test_loc(self, test_path, copy=False, debug=False, must_exist=True): + def get_test_loc(self, test_path, copy=False, debug=False, must_exist=True, relative=False): """ Given a `test_path` relative to the self.test_data_dir directory, return the location to a test file or directory for this path. 
Copy to a temp @@ -128,6 +128,11 @@ def get_test_loc(self, test_path, copy=False, debug=False, must_exist=True): # cleanup of VCS that could be left over from checkouts self.remove_vcs(target_dir) test_loc = target_dir + + if relative: + _, _, rel_test_loc = test_loc.rpartition(os.getcwd()) + return rel_test_loc.strip("/") + return test_loc def get_temp_file(self, extension=None, dir_name="td", file_name="tf"): diff --git a/src/scancode/cli.py b/src/scancode/cli.py index ca4d279eb0e..1e418e32f07 100644 --- a/src/scancode/cli.py +++ b/src/scancode/cli.py @@ -221,16 +221,6 @@ def default_processes(): callback=validate_input_path, type=click.Path(exists=True, readable=True, path_type=str)) -@click.option('--include', - multiple=True, - default=None, - metavar='', - help='Include files matching .', - sort_order=11, - help_group=cliutils.CORE_GROUP, - cls=PluggableCommandLineOption, -) - @click.option('--ignore', multiple=True, default=None, @@ -415,7 +405,6 @@ def default_processes(): def scancode( ctx, input, # NOQA - include, ignore, strip_root, full_root, @@ -527,7 +516,6 @@ def scancode( # run proper success, _results = run_scan( input=input, - include=include, ignore=ignore, from_json=from_json, strip_root=strip_root, @@ -570,7 +558,6 @@ def scancode( def run_scan( input, # - include=[], ignore=[], from_json=False, strip_root=False, @@ -623,6 +610,9 @@ def echo_func(*_args, **_kwargs): msg = 'At least one input path is required.' raise ScancodeError(msg) + # To support multiple path inputs + include = [] + if not isinstance(input, (list, tuple)): if not isinstance(input, str): msg = 'Unknown format: "{}".'.format(repr(input)) @@ -637,8 +627,6 @@ def echo_func(*_args, **_kwargs): # VirtualCodebase; otherwise we have to process `input` to make it a single # root with excludes. elif not from_json: - # FIXME: support the multiple root better. 
This is quirky at best - # This is the case where we have a list of input path and the # `from_json` option is not selected: we can handle this IFF they share # a common root directory and none is an absolute path @@ -650,30 +638,33 @@ def echo_func(*_args, **_kwargs): ) raise ScancodeError(msg) + abs_input = [os.path.abspath(i) for i in input] + # find the common prefix directory (note that this is a pre string # operation hence it may return non-existing paths - common_prefix = os.path.commonprefix(input) + common_prefix = os.path.commonprefix(abs_input) if not common_prefix: # we have no common prefix, but all relative. therefore the - # parent/root is the current ddirectory + # parent/root is the current directory common_prefix = str('.') + elif not common_prefix.endswith("/"): + # common prefix has trailing incomplete dirname + # for example the common prefix of "/temp/scancode" + # and "/temp/scans" is "/temp/scan" + common_prefix, _, _ = common_prefix.rpartition("/") elif not os.path.isdir(common_prefix): msg = ( 'Invalid inputs: all input paths must share a ' - 'common single parent directory.' + f'common single parent directory. common part: {common_prefix}' ) raise ScancodeError(msg) - # and we craft a list of synthetic --include path pattern options from - # the input list of paths - included_paths = [as_posixpath(path).rstrip('/') for path in input] - include.extend(included_paths) - - # ... 
and use the common prefix as our new input - # FIXME: we should not walk outside inputs + # and we craft a list of include paths where the codebase walks + # will start from, even though the root is the common prefix + include = [as_posixpath(path).rstrip('/') for path in abs_input] input = common_prefix # NOQA # build mappings of all options to pass down to plugins diff --git a/tests/commoncode/test_resource.py b/tests/commoncode/test_resource.py index 6249ebb435f..4b36bd92380 100644 --- a/tests/commoncode/test_resource.py +++ b/tests/commoncode/test_resource.py @@ -353,7 +353,7 @@ def test_get_resource_for_multiple_resource_codebase(self): codebase = Codebase(test_codebase) assert codebase.get_resource("resource/a").path == "resource/a" - assert codebase.get_resource("/resource/c").path == "resource/c" + assert codebase.get_resource("resource/c").path == "resource/c" assert codebase.get_resource("resource/dsasda/../b/").path == "resource/b" def test_Resource_build_path(self): diff --git a/tests/scancode/data/help/help_linux.txt b/tests/scancode/data/help/help_linux.txt index 855b7c7959d..2b917909f37 100644 --- a/tests/scancode/data/help/help_linux.txt +++ b/tests/scancode/data/help/help_linux.txt @@ -144,7 +144,6 @@ Options: -n, --processes INT Set the number of parallel processes to use. Disable parallel processing if 0. Also disable threading if -1. [default: (number of CPUs)-1] - --include Include files matching . -q, --quiet Do not print summary or progress. -v, --verbose Print progress as file-by-file path instead of a progress bar. Print verbose scan counters. 
diff --git a/tests/scancode/test_cli.py b/tests/scancode/test_cli.py index 9d038f71e62..0dca907efcf 100644 --- a/tests/scancode/test_cli.py +++ b/tests/scancode/test_cli.py @@ -168,7 +168,7 @@ def test_scan_info_returns_full_root(): file_paths = [f['path'] for f in result_data['files']] assert len(file_paths) == 12 # note that we strip paths from leading and trailing slashes - root = fileutils.as_posixpath(test_dir).strip('/') + root = fileutils.as_posixpath(test_dir) assert all(p.startswith(root) for p in file_paths) @@ -184,7 +184,7 @@ def test_scan_info_returns_correct_full_root_with_single_file(): scanned_file = files[0] # and we check that the path is the full path without repeating the file name # note that the path never contain leading and trailing slashes - assert scanned_file['path'] == fileutils.as_posixpath(test_file).strip('/') + assert scanned_file['path'] == fileutils.as_posixpath(test_file) def test_scan_info_returns_does_not_strip_root_with_single_file(): @@ -837,6 +837,15 @@ def test_scan_should_not_fail_with_low_max_in_memory_setting_when_ignoring_files run_scan_click(args, expected_rc=0) +def test_scan_supports_multiple_input_paths(): + test_file_1 = test_env.get_test_loc('summaries/client', relative=True) + test_file_2 = test_env.get_test_loc('summaries/counts', relative=True) + result_file = test_env.get_temp_file('json') + args = ['--info', '-n', '1', test_file_1, test_file_2, '--json', result_file] + run_scan_click(args, expected_rc=0) + + + def test_get_displayable_summary(): from scancode.cli import get_displayable_summary from commoncode.resource import Codebase From 43c5acfce6a73ee2864b5d5c023355f5a1fd8eef Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Thu, 11 Jun 2026 15:18:40 +0530 Subject: [PATCH 03/10] Bump commoncode version for release Signed-off-by: Ayan Sinha Mahapatra --- pyproject-commoncode.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject-commoncode.toml b/pyproject-commoncode.toml 
index 6c69ab439b3..10ab2d8631c 100644 --- a/pyproject-commoncode.toml +++ b/pyproject-commoncode.toml @@ -4,7 +4,7 @@ build-backend = "flot.buildapi" [project] name = "commoncode" -version = "32.4.2" +version = "32.5.0" authors = [ { name = "nexB. Inc. and others", email = "info@aboutcode.org" }, ] From 0445da5973b00dc7b05f4150b98d61f342028430 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Thu, 11 Jun 2026 15:37:51 +0530 Subject: [PATCH 04/10] Fix commoncode release scripts and bump version Signed-off-by: Ayan Sinha Mahapatra --- .github/workflows/commoncode-release.yml | 2 +- .../licensedcode-data-index-release.yml | 2 +- commoncode-CHANGELOG.rst | 19 +++++++++++++++++++ pyproject-commoncode.toml | 5 ++--- 4 files changed, 23 insertions(+), 5 deletions(-) diff --git a/.github/workflows/commoncode-release.yml b/.github/workflows/commoncode-release.yml index 467e6ac234b..ca127d29656 100644 --- a/.github/workflows/commoncode-release.yml +++ b/.github/workflows/commoncode-release.yml @@ -1,4 +1,4 @@ -name: Create library release archives, create a GH release and publish PyPI wheel and sdist on tag in main branch +name: Create and release commoncode wheels on GitHub and Pypi # This is executed automatically on a tag in the main branch diff --git a/.github/workflows/licensedcode-data-index-release.yml b/.github/workflows/licensedcode-data-index-release.yml index ba267f89f5f..353829d1905 100644 --- a/.github/workflows/licensedcode-data-index-release.yml +++ b/.github/workflows/licensedcode-data-index-release.yml @@ -1,4 +1,4 @@ -name: Create library release archives, create a GH release and publish PyPI wheel and sdist on tag in main branch +name: Create and release licensedcode index & data wheels on GitHub and Pypi # This is executed automatically on a tag in the main branch diff --git a/commoncode-CHANGELOG.rst b/commoncode-CHANGELOG.rst index dc63866360b..7e3a2fb344f 100644 --- a/commoncode-CHANGELOG.rst +++ b/commoncode-CHANGELOG.rst @@ -1,6 +1,25 @@ 
Release notes ============= + +Version 32.5.1 - (2026-06-11) +----------------------------- + +- Minor fix in pyproject.toml to release wheels + to pypi properly. + +Version 32.5.0 - (2026-06-11) +----------------------------- + +- Merge commoncode back into scancode-toolkit + https://github.com/aboutcode-org/scancode-toolkit/pull/5116 + +- Add support to create codebase from multiple input paths by + starting codebase walk from these inputs and then ignoring + based on path patterns. Improves codebase and resource + collection and creation performance for multi-path scan inputs + https://github.com/aboutcode-org/scancode-toolkit/pull/5055 + Version 32.4.2 - (2025-01-08) ----------------------------- diff --git a/pyproject-commoncode.toml b/pyproject-commoncode.toml index 10ab2d8631c..92adbe61f1f 100644 --- a/pyproject-commoncode.toml +++ b/pyproject-commoncode.toml @@ -42,9 +42,6 @@ metadata_files = [ requires-python = ">=3.10" -[project.urls] -Homepage = "https://github.com/nexB/scancode-toolkit" - dependencies = [ "attrs >= 18.1,!=20.1.0;python_version<'3.11'", "attrs >= 22.1.0;python_version>='3.11'", @@ -55,6 +52,8 @@ dependencies = [ "text_unidecode >= 1.0" ] +[project.urls] +Homepage = "https://github.com/nexB/scancode-toolkit" [project.optional-dependencies] dev = [ From 72dd3d8d1a6e6b4dc17f04b2d589e36a95ea1cdf Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Thu, 11 Jun 2026 15:55:19 +0530 Subject: [PATCH 05/10] Bump version for commoncode v32.5.2 Signed-off-by: Ayan Sinha Mahapatra --- commoncode-CHANGELOG.rst | 4 ++++ pyproject-commoncode.toml | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/commoncode-CHANGELOG.rst b/commoncode-CHANGELOG.rst index 7e3a2fb344f..2d56e748149 100644 --- a/commoncode-CHANGELOG.rst +++ b/commoncode-CHANGELOG.rst @@ -1,6 +1,10 @@ Release notes ============= +Version 32.5.2 - (2026-06-11) +----------------------------- + +- Bump version properly. 
Version 32.5.1 - (2026-06-11) ----------------------------- diff --git a/pyproject-commoncode.toml b/pyproject-commoncode.toml index 92adbe61f1f..3ea4920d29c 100644 --- a/pyproject-commoncode.toml +++ b/pyproject-commoncode.toml @@ -4,7 +4,7 @@ build-backend = "flot.buildapi" [project] name = "commoncode" -version = "32.5.0" +version = "32.5.2" authors = [ { name = "nexB. Inc. and others", email = "info@aboutcode.org" }, ] From bb148eadad2ea9eb0c60a4ef8759675365b8c06f Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Thu, 11 Jun 2026 17:08:56 +0530 Subject: [PATCH 06/10] Fix test failures Signed-off-by: Ayan Sinha Mahapatra --- src/commoncode/resource.py | 4 ++-- src/commoncode/testcase.py | 2 +- src/scancode/outdated.py | 12 ++++++++++-- src/scancode_config.py | 2 +- tests/scancode/data/help/help.txt | 3 +-- tests/scancode/test_outdated.py | 4 ++-- 6 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/commoncode/resource.py b/src/commoncode/resource.py index 378388a2dd3..5302f2f9d72 100644 --- a/src/commoncode/resource.py +++ b/src/commoncode/resource.py @@ -1667,8 +1667,8 @@ def strip_first_path_segment(path): '' >>> strip_first_path_segment('foo/bar/baz') 'bar/baz' - >>> strip_first_path_segment('/foo/bar/baz/') - 'bar/baz' + >>> strip_first_path_segment('/foo/bar/baz') + 'foo/bar/baz' >>> strip_first_path_segment('foo/') '' """ diff --git a/src/commoncode/testcase.py b/src/commoncode/testcase.py index 4feb7c645b1..ee680e24da2 100644 --- a/src/commoncode/testcase.py +++ b/src/commoncode/testcase.py @@ -132,7 +132,7 @@ def get_test_loc(self, test_path, copy=False, debug=False, must_exist=True, rela if relative: _, _, rel_test_loc = test_loc.rpartition(os.getcwd()) - return rel_test_loc.strip("/") + return rel_test_loc.strip("/").strip("\\") return test_loc diff --git a/src/scancode/outdated.py b/src/scancode/outdated.py index 4be850d8470..2c68dc39e2e 100644 --- a/src/scancode/outdated.py +++ b/src/scancode/outdated.py @@ -83,7 +83,11 @@ def 
total_seconds(td): class VersionCheckState: - def __init__(self): + def __init__(self, is_test=False): + if is_test: + self.state = {} + return + self.statefile_path = os.path.join( scancode_cache_dir, 'scancode-version-check.json') self.lockfile_path = self.statefile_path + '.lockfile' @@ -135,6 +139,7 @@ def check_scancode_version( release_date=scancode_release_date, new_version_url='https://pypi.org/pypi/scancode-toolkit/json', force=False, + is_test=False, ): """ Check for an updated version of scancode-toolkit. Return a message to @@ -146,6 +151,7 @@ def check_scancode_version( installed_version=installed_version, new_version_url=new_version_url, force=force, + is_test=is_test, ) if newer_version: return build_outdated_message( @@ -159,6 +165,7 @@ def fetch_newer_version( installed_version=scancode_version, new_version_url='https://pypi.org/pypi/scancode-toolkit/json', force=False, + is_test=False, ): """ Return a version string if there is an updated version of scancode-toolkit @@ -175,9 +182,10 @@ def fetch_newer_version( try: installed_version = packaging_version.parse(installed_version) - state = VersionCheckState() + state = VersionCheckState(is_test=is_test) current_time = datetime.datetime.utcnow() + latest_version = None # Determine if we need to refresh the state if ('last_check' in state.state and 'latest_version' in state.state): last_check = datetime.datetime.strptime( diff --git a/src/scancode_config.py b/src/scancode_config.py index 20c57a19bef..6e9f634b083 100644 --- a/src/scancode_config.py +++ b/src/scancode_config.py @@ -95,7 +95,7 @@ def _create_dir(location): from subprocess import CalledProcessError # this may fail with exceptions - cmd = 'git', 'describe', '--tags', + cmd = 'git', 'describe', '--tags', '--match=v*' try: output = check_output(cmd, stderr=STDOUT) __version__ = output.decode('utf-8').strip() diff --git a/tests/scancode/data/help/help.txt b/tests/scancode/data/help/help.txt index e725888ead4..d65f1f00f45 100644 --- 
a/tests/scancode/data/help/help.txt +++ b/tests/scancode/data/help/help.txt @@ -92,8 +92,6 @@ Options: such that all paths have a common root directory. pre-scan: - --ignore Ignore files matching . - --include Include files matching . --facet = Add the to files with a path matching . @@ -138,6 +136,7 @@ Options: which are todo items and needs manual review. core: + --ignore Ignore files matching . --timeout Stop an unfinished file scan after a timeout in seconds. [default: 120 seconds] -n, --processes INT Set the number of parallel processes to use. Disable diff --git a/tests/scancode/test_outdated.py b/tests/scancode/test_outdated.py index cdac7853b2b..0509c6ea360 100644 --- a/tests/scancode/test_outdated.py +++ b/tests/scancode/test_outdated.py @@ -152,8 +152,8 @@ def jget(*args, **kwargs): json=jget, status_code=200 ) - assert not outdated.fetch_newer_version(force=True) - assert not outdated.check_scancode_version(force=True) + assert not outdated.fetch_newer_version(force=True, is_test=True) + assert not outdated.check_scancode_version(force=True, is_test=True) def test_fetch_newer_version_local_git_version(): From dc9d3b5672aa0ae3cb6fe19e596bfff58bed5cb9 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Mon, 22 Jun 2026 19:31:28 +0530 Subject: [PATCH 07/10] Install commoncode locally in dev installation Install commoncode from local source code instead of released commoncode so this can be installed/tested directly. Also add tests for released commoncode via azure pipelines. 
Signed-off-by: Ayan Sinha Mahapatra --- azure-pipelines.yml | 15 +++++++++++++++ configure | 11 ++++++++++- configure.bat | 3 +++ .../{test_plugin_ignore.py => test_ignore.py} | 0 4 files changed, 28 insertions(+), 1 deletion(-) rename tests/scancode/{test_plugin_ignore.py => test_ignore.py} (100%) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 3ca4f7e915e..485a304063b 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -183,6 +183,21 @@ jobs: venv/bin/scancode -i --verbose samples/ -n3 --json foo.json; done +################################################################################ +# Tests with released commoncode instead of local editable commoncode +################################################################################ + + - template: etc/ci/azure-posix.yml + parameters: + job_name: ubuntu_test_released_commocode + image_name: ubuntu-22.04 + python_versions: ['3.14'] + python_architecture: x64 + test_suites: + all: + venv/bin/pip uninstall commoncode && venv/bin/pip install commoncode + venv/bin/pytest -n 2 -vvs tests/scancode/test_cli.py --reruns 2 + ################################################################################ # Tests using a plain pip install to get the latest of all wheels diff --git a/configure b/configure index 5262999ce74..d278901062e 100755 --- a/configure +++ b/configure @@ -256,6 +256,15 @@ install_packages() { $1 } +install_packages_with_local() { + # commoncode is present as dependencies of dependencies and so + # we need to install commoncode from local source first so this + # is tested and not the released commoncode + "$CFG_BIN_DIR/flot" --pyproject pyproject-commoncode.toml + "$CFG_BIN_DIR/pip" install ./dist/commoncode*.whl + install_packages "$CFG_REQUIREMENTS" +} + ################################ cli_help() { @@ -313,7 +322,7 @@ PIP_EXTRA_ARGS="$PIP_EXTRA_ARGS" find_python create_virtualenv "$VIRTUALENV_DIR" install_packages "$FLOT_REQUIREMENTS" -install_packages "$CFG_REQUIREMENTS" 
+install_packages_with_local . "$CFG_BIN_DIR/activate" "$CFG_BIN_DIR/scancode-train-gibberish-model" diff --git a/configure.bat b/configure.bat index b4f61216441..0bf0b8615bb 100644 --- a/configure.bat +++ b/configure.bat @@ -162,6 +162,9 @@ if %ERRORLEVEL% neq 0 ( %PIP_EXTRA_ARGS% ^ %FLOT_REQUIREMENTS% +"%CFG_BIN_DIR%\flot" --pyproject pyproject-commoncode.toml +"%CFG_BIN_DIR%\pip" install ./dist/commoncode*.whl + "%CFG_BIN_DIR%\pip" install ^ --upgrade ^ %CFG_QUIET% ^ diff --git a/tests/scancode/test_plugin_ignore.py b/tests/scancode/test_ignore.py similarity index 100% rename from tests/scancode/test_plugin_ignore.py rename to tests/scancode/test_ignore.py From 1c04c47269e51d7706901d2e09b5634d885b0082 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Mon, 22 Jun 2026 20:53:19 +0530 Subject: [PATCH 08/10] Add new CLI option --config-file Signed-off-by: Ayan Sinha Mahapatra --- src/scancode/cli.py | 46 +++++++++++++++++++ tests/scancode/data/help/help.txt | 44 +++++++++--------- tests/scancode/data/help/help_linux.txt | 44 +++++++++--------- tests/scancode/data/plugin_ignore/ignore.yaml | 3 ++ tests/scancode/test_ignore.py | 15 ++++++ 5 files changed, 110 insertions(+), 42 deletions(-) create mode 100644 tests/scancode/data/plugin_ignore/ignore.yaml diff --git a/src/scancode/cli.py b/src/scancode/cli.py index 1e418e32f07..f7fe221c214 100644 --- a/src/scancode/cli.py +++ b/src/scancode/cli.py @@ -17,12 +17,14 @@ import logging import os import platform +import saneyaml import sys import traceback from collections import defaultdict from functools import partial from multiprocessing import TimeoutError +from pathlib import Path from time import sleep from time import time @@ -231,6 +233,15 @@ def default_processes(): cls=PluggableCommandLineOption, ) +@click.option('-c', '--config-file', + type=click.File('r'), + required=False, + help='Path to the configuration file.', + sort_order=11, + help_group=cliutils.CORE_GROUP, + cls=PluggableCommandLineOption, +) + 
@click.option('--strip-root', is_flag=True, default=False, @@ -405,6 +416,7 @@ def default_processes(): def scancode( ctx, input, # NOQA + config_file, ignore, strip_root, full_root, @@ -517,6 +529,7 @@ def scancode( success, _results = run_scan( input=input, ignore=ignore, + config_file=config_file, from_json=from_json, strip_root=strip_root, full_root=full_root, @@ -558,6 +571,7 @@ def scancode( def run_scan( input, # + config_file=None, ignore=[], from_json=False, strip_root=False, @@ -667,6 +681,10 @@ def echo_func(*_args, **_kwargs): include = [as_posixpath(path).rstrip('/') for path in abs_input] input = common_prefix # NOQA + config_ignores = load_configuration_file(config_file) + if config_ignores: + ignore = tuple(ignore) + tuple(config_ignores) + # build mappings of all options to pass down to plugins standard_options = dict( input=input, @@ -1108,6 +1126,34 @@ def echo_func(*_args, **_kwargs): return success, results +def load_configuration_file(path): + """ + Load scancode configuration values from a file at `path`. + + Currently only supports ignore path patterns specified with + "ignored_patterns". This should be compatible with scancode.io + configuration values whenever possible: + https://scancodeio.readthedocs.io/en/latest/project-configuration.html + """ + ignores = [] + if not path: + return ignores + + click.echo(f"Loading configuration from {getattr(path, 'name', path)}", err=True) + try: + + config_values = saneyaml.load(path.read()) + ignores = config_values.get("ignored_patterns", []) + except Exception: + msg = ( + f'Failed to load configuration from "{path}". ' + f"The file format is invalid." 
+ ) + raise ScancodeError(msg + '\n' + traceback.format_exc()) + + return ignores + + def run_codebase_plugins( stage, plugins, diff --git a/tests/scancode/data/help/help.txt b/tests/scancode/data/help/help.txt index d65f1f00f45..52bc2e0ce73 100644 --- a/tests/scancode/data/help/help.txt +++ b/tests/scancode/data/help/help.txt @@ -136,27 +136,29 @@ Options: which are todo items and needs manual review. core: - --ignore Ignore files matching . - --timeout Stop an unfinished file scan after a timeout in - seconds. [default: 120 seconds] - -n, --processes INT Set the number of parallel processes to use. Disable - parallel processing if 0. Also disable threading if - -1. [default: (number of CPUs)-1] - -q, --quiet Do not print summary or progress. - -v, --verbose Print progress as file-by-file path instead of a - progress bar. Print verbose scan counters. - --from-json Load codebase from one or more JSON scan - file(s). - --max-in-memory INTEGER Maximum number of files and directories scan details - kept in memory during a scan. Additional files and - directories scan details above this number are cached - on-disk rather than in memory. Use 0 to use unlimited - memory and disable on-disk caching. Use -1 to use - only on-disk caching. [default: 10000] - --max-depth INTEGER Maximum nesting depth of subdirectories to scan. - Descend at most INTEGER levels of directories below - and including the starting directory. Use 0 for no - scan depth limit. + --ignore Ignore files matching . + --timeout Stop an unfinished file scan after a timeout in + seconds. [default: 120 seconds] + -n, --processes INT Set the number of parallel processes to use. + Disable parallel processing if 0. Also disable + threading if -1. [default: (number of CPUs)-1] + -c, --config-file FILENAME Path to the configuration file. + -q, --quiet Do not print summary or progress. + -v, --verbose Print progress as file-by-file path instead of a + progress bar. Print verbose scan counters. 
+ --from-json Load codebase from one or more JSON scan + file(s). + --max-in-memory INTEGER Maximum number of files and directories scan + details kept in memory during a scan. Additional + files and directories scan details above this + number are cached on-disk rather than in memory. + Use 0 to use unlimited memory and disable on-disk + caching. Use -1 to use only on-disk caching. + [default: 10000] + --max-depth INTEGER Maximum nesting depth of subdirectories to scan. + Descend at most INTEGER levels of directories + below and including the starting directory. Use 0 + for no scan depth limit. documentation: -h, --help Show this message and exit. diff --git a/tests/scancode/data/help/help_linux.txt b/tests/scancode/data/help/help_linux.txt index 2b917909f37..9630f39fb01 100644 --- a/tests/scancode/data/help/help_linux.txt +++ b/tests/scancode/data/help/help_linux.txt @@ -138,27 +138,29 @@ Options: which are todo items and needs manual review. core: - --ignore Ignore files matching . - --timeout Stop an unfinished file scan after a timeout in - seconds. [default: 120 seconds] - -n, --processes INT Set the number of parallel processes to use. Disable - parallel processing if 0. Also disable threading if - -1. [default: (number of CPUs)-1] - -q, --quiet Do not print summary or progress. - -v, --verbose Print progress as file-by-file path instead of a - progress bar. Print verbose scan counters. - --from-json Load codebase from one or more JSON scan - file(s). - --max-in-memory INTEGER Maximum number of files and directories scan details - kept in memory during a scan. Additional files and - directories scan details above this number are cached - on-disk rather than in memory. Use 0 to use unlimited - memory and disable on-disk caching. Use -1 to use - only on-disk caching. [default: 10000] - --max-depth INTEGER Maximum nesting depth of subdirectories to scan. - Descend at most INTEGER levels of directories below - and including the starting directory. 
Use 0 for no - scan depth limit. + --ignore Ignore files matching . + --timeout Stop an unfinished file scan after a timeout in + seconds. [default: 120 seconds] + -n, --processes INT Set the number of parallel processes to use. + Disable parallel processing if 0. Also disable + threading if -1. [default: (number of CPUs)-1] + -c, --config-file FILENAME Path to the configuration file. + -q, --quiet Do not print summary or progress. + -v, --verbose Print progress as file-by-file path instead of a + progress bar. Print verbose scan counters. + --from-json Load codebase from one or more JSON scan + file(s). + --max-in-memory INTEGER Maximum number of files and directories scan + details kept in memory during a scan. Additional + files and directories scan details above this + number are cached on-disk rather than in memory. + Use 0 to use unlimited memory and disable on-disk + caching. Use -1 to use only on-disk caching. + [default: 10000] + --max-depth INTEGER Maximum nesting depth of subdirectories to scan. + Descend at most INTEGER levels of directories + below and including the starting directory. Use 0 + for no scan depth limit. documentation: -h, --help Show this message and exit. 
diff --git a/tests/scancode/data/plugin_ignore/ignore.yaml b/tests/scancode/data/plugin_ignore/ignore.yaml new file mode 100644 index 00000000000..fc52a109624 --- /dev/null +++ b/tests/scancode/data/plugin_ignore/ignore.yaml @@ -0,0 +1,3 @@ +ignored_patterns: + - '*.doc' + - '*/test*' diff --git a/tests/scancode/test_ignore.py b/tests/scancode/test_ignore.py index db739db88a7..2a6bf1ef14c 100644 --- a/tests/scancode/test_ignore.py +++ b/tests/scancode/test_ignore.py @@ -220,6 +220,21 @@ def test_scancode_multiple_ignores(self): scan_locs = [x['path'] for x in scan_result['files']] assert scan_locs == [u'user', u'user/src', u'user/src/test'] + def test_scancode_ignore_files_from_config(self): + test_dir = self.extract_test_tar('plugin_ignore/user.tgz') + config_file = self.get_test_loc('plugin_ignore/ignore.yaml') + result_file = self.get_temp_file('json') + args = ['--copyright', '--strip-root', '--config-file', config_file, test_dir, '--json', result_file] + run_scan_click(args) + scan_result = load_json_result(result_file) + assert scan_result['headers'][0]['extra_data']['files_count'] == 0 + scan_locs = [x['path'] for x in scan_result['files']] + expected = [ + u'user', + u'user/src', + ] + assert scan_locs == expected + def test_scancode_codebase_attempt_to_access_an_ignored_resourced_cached_to_disk(self): test_dir = self.extract_test_tar('plugin_ignore/user.tgz') result_file = self.get_temp_file('json') From 815e454caed896bdd6b3fb65ea54f8067ab5bf23 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Mon, 22 Jun 2026 20:53:38 +0530 Subject: [PATCH 09/10] Update ignore/config documentation Signed-off-by: Ayan Sinha Mahapatra --- azure-pipelines.yml | 3 +- .../scancode-cli/cli-core-options.rst | 95 ++++++++++++ .../scancode-cli/cli-help-text-options.rst | 68 ++++++--- .../scancode-cli/cli-post-scan-options.rst | 50 +++++++ .../scancode-cli/cli-pre-scan-options.rst | 140 ------------------ docs/source/rst-snippets/cli-core-options.rst | 49 +++--- 
.../rst-snippets/cli-pre-scan-options.rst | 4 - tests/licensedcode/test_detect.py | 4 +- .../{test_ignore.py => test_plugin_ignore.py} | 0 9 files changed, 221 insertions(+), 192 deletions(-) rename tests/scancode/{test_ignore.py => test_plugin_ignore.py} (100%) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 485a304063b..b0abbc0ec64 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -195,8 +195,7 @@ jobs: python_architecture: x64 test_suites: all: - venv/bin/pip uninstall commoncode && venv/bin/pip install commoncode - venv/bin/pytest -n 2 -vvs tests/scancode/test_cli.py --reruns 2 + venv/bin/pip uninstall -y commoncode && venv/bin/pip install commoncode && venv/bin/pytest -n 2 -vvs tests/scancode/test_cli.py --reruns 2 ################################################################################ diff --git a/docs/source/reference/scancode-cli/cli-core-options.rst b/docs/source/reference/scancode-cli/cli-core-options.rst index e945d245da0..e89374f1a4f 100644 --- a/docs/source/reference/scancode-cli/cli-core-options.rst +++ b/docs/source/reference/scancode-cli/cli-core-options.rst @@ -145,3 +145,98 @@ Comparing progress message options This would scan the file ``samples/levelone/leveltwo/file`` but ignore ``samples/levelone/leveltwo/levelthree/file`` + +---- + +.. _cli-ignore-option: + +``--ignore <pattern>`` +---------------------- + + In a scan, all files inside the directory specified as an input argument are scanned. But if + there are some files which you don't want to scan, the ``--ignore`` option can be used to + exclude them. + + **Example** + + .. code-block:: shell + + scancode --ignore "*.java" samples samples.json + + Here, ScanCode ignores files ending with `.java`, and continues with other files as usual. + + More information on :ref:`glob-pattern-matching`. + +---- + +.. 
_cli-config-option: + +``--config-file <file>`` +------------------------ + + Path patterns which should be ignored in the scan can also be provided + through a configuration file. + + **Example** + + .. code-block:: shell + + scancode --config-file scancode-config.yaml samples samples.json + + .. code-block:: yaml + + ignored_patterns: + - '*.java' + - '*/licenses/*' + + Here, ScanCode ignores files ending with `.java` and the `licenses` directory, + and continues with other files as usual. + + This is also compatible with the `scancode.io configuration file <https://scancodeio.readthedocs.io/en/latest/project-configuration.html>`_. + +---- + +.. _glob-pattern-matching: + +Glob Pattern Matching +--------------------- + + Options such as ``--ignore`` use pattern matching, so the basics of glob pattern matching are + discussed briefly below. + + Glob pattern matching is useful for matching a group of files, by using patterns in their + names. Then using these patterns, files are grouped and treated differently as required. + + Here are some rules from the `Linux Manual <https://man7.org/linux/man-pages/man7/glob.7.html>`_ + on glob patterns. Refer to it for more detailed information. + + A string is a wildcard pattern if it contains one of the characters '?', '*' or '['. Globbing + is the operation that expands a wildcard pattern into the list of pathnames matching the + pattern. Matching is defined by: + + - A '?' (not between brackets) matches any single character. + + - A '*' (not between brackets) matches any string, including the empty string. + + - An expression "[...]" where the first character after the leading '[' is not an '!' matches a + single character, namely any of the characters enclosed by the brackets. + + - There is one special convention: two characters separated by '-' denote a range. + + - An expression "[!...]" matches a single character, namely any character that is not matched + by the expression obtained by removing the first '!' from it. + + - A '/' in a pathname cannot be matched by a '?' or '*' wildcard, or by a range like "[.-0]". 
+ + Note that wildcard patterns are not regular expressions, although they are a bit similar. + + For more information on glob pattern matching refer these resources: + + - `Linux Manual `_ + - `Wildcard Match Documentation `_. + + You can also import these Python Libraries to practice UNIX style pattern matching: + + - `fnmatch `_ for File Name matching + - `glob `_ for File Path matching + diff --git a/docs/source/reference/scancode-cli/cli-help-text-options.rst b/docs/source/reference/scancode-cli/cli-help-text-options.rst index a3bfb44f776..bd857273874 100644 --- a/docs/source/reference/scancode-cli/cli-help-text-options.rst +++ b/docs/source/reference/scancode-cli/cli-help-text-options.rst @@ -125,8 +125,6 @@ The following help text is displayed for ScanCode version 32.0.0: such that all paths have a common root directory. pre-scan: - --ignore Ignore files matching . - --include Include files matching . --classify Classify files with flags indicating whether the file is a legal, readme, test or similar file. --facet = Add the to files with a path matching @@ -169,11 +167,13 @@ The following help text is displayed for ScanCode version 32.0.0: at the file and directory level. core: + --ignore Ignore files matching . --timeout Stop an unfinished file scan after a timeout in seconds. [default: 120 seconds] -n, --processes INT Set the number of parallel processes to use. Disable parallel processing if 0. Also disable threading if -1. [default: (number of CPUs)-1] + -c, --config-file FILENAME Path to the configuration file. -q, --quiet Do not print summary or progress. -v, --verbose Print progress as file-by-file path instead of a progress bar. Print verbose scan counters. @@ -512,7 +512,7 @@ for ScanCode Version 32.0.0. 
-------------------------------------------- Plugin: scancode_post_scan:classify class: summarycode.classify_plugin:FileClassifier codebase_attributes: - resource_attributes: is_legal, is_manifest, is_readme, is_top_level, is_key_file + resource_attributes: is_legal, is_manifest, is_readme, is_top_level, is_key_file, is_community sort_order: 4 required_plugins: options: @@ -690,6 +690,19 @@ for ScanCode Version 32.0.0. - packages + -------------------------------------------- + Plugin: scancode_post_scan:todo class: summarycode.todo:AmbiguousDetectionsToDoPlugin + codebase_attributes: todo + resource_attributes: for_todo + sort_order: 3 + required_plugins: + options: + help_group: post-scan, name: todo: --todo + help: Summarize scans by providing all ambiguous detections which are todo items and needs manual review. + doc: + Summarize a scan by compiling review items of ambiguous detections. + + -------------------------------------------- Plugin: scancode_pre_scan:facet class: summarycode.facet:AddFacet codebase_attributes: @@ -705,21 +718,6 @@ for ScanCode Version 32.0.0. test vs. data, etc. - -------------------------------------------- - Plugin: scancode_pre_scan:ignore class: scancode.plugin_ignore:ProcessIgnore - codebase_attributes: - resource_attributes: - sort_order: 100 - required_plugins: - options: - help_group: pre-scan, name: ignore: --ignore - help: Ignore files matching . - help_group: pre-scan, name: include: --include - help: Include files matching . - doc: - Include or ignore files matching patterns. - - -------------------------------------------- Plugin: scancode_scan:copyrights class: cluecode.plugin_copyright:CopyrightScanner codebase_attributes: @@ -761,10 +759,23 @@ for ScanCode Version 32.0.0. Tag a file as generated. 
+ -------------------------------------------- + Plugin: scancode_scan:go_symbol class: go_inspector.plugin:GoSymbolScannerPlugin + codebase_attributes: + resource_attributes: go_symbols + sort_order: 100 + required_plugins: + options: + help_group: primary scans, name: go_symbol: --go-symbol + help: Collect Go symbols. + doc: + Scan a Go binary for symbols using GoReSym. + + -------------------------------------------- Plugin: scancode_scan:info class: scancode.plugin_info:InfoScanner codebase_attributes: - resource_attributes: date, sha1, md5, sha256, mime_type, file_type, programming_language, is_binary, is_text, is_archive, is_media, is_source, is_script + resource_attributes: date, sha1, md5, sha256, sha1_git, mime_type, file_type, programming_language, is_binary, is_text, is_archive, is_media, is_source, is_script sort_order: 0 required_plugins: options: @@ -779,7 +790,7 @@ for ScanCode Version 32.0.0. Plugin: scancode_scan:licenses class: licensedcode.plugin_license:LicenseScanner codebase_attributes: license_detections resource_attributes: detected_license_expression, detected_license_expression_spdx, license_detections, license_clues, percentage_of_license_text - sort_order: 4 + sort_order: 5 required_plugins: options: help_group: primary scans, name: license: -l, --license @@ -804,13 +815,15 @@ for ScanCode Version 32.0.0. Plugin: scancode_scan:packages class: packagedcode.plugin_package:PackageScanner codebase_attributes: packages, dependencies resource_attributes: package_data, for_packages - sort_order: 3 + sort_order: 4 required_plugins: scan:licenses options: help_group: primary scans, name: package: -p, --package help: Scan for application package and dependency manifests, lockfiles and related data. help_group: primary scans, name: system_package: --system-package help: Scan for installed system package databases. 
+ help_group: primary scans, name: package_in_compiled: --package-in-compiled + help: Scan for package and dependency related data in compiled binaries. Currently supported compiled binaries: Go, Rust. help_group: primary scans, name: package_only: --package-only help: Scan for system and application package data and skip license/copyright detection and top-level package creation. help_group: documentation, name: list_packages: --list-packages @@ -821,6 +834,19 @@ for ScanCode Version 32.0.0. level. + -------------------------------------------- + Plugin: scancode_scan:rust_symbol class: rust_inspector.plugin:RustSymbolScannerPlugin + codebase_attributes: + resource_attributes: rust_symbols + sort_order: 100 + required_plugins: + options: + help_group: primary scans, name: rust_symbol: --rust-symbol + help: Collect Rust symbols from rust binaries. + doc: + Scan a Rust binary for symbols using blint, lief and symbolic. + + -------------------------------------------- Plugin: scancode_scan:urls class: cluecode.plugin_url:UrlScanner codebase_attributes: diff --git a/docs/source/reference/scancode-cli/cli-post-scan-options.rst b/docs/source/reference/scancode-cli/cli-post-scan-options.rst index 690b4d27a22..e3a9a628a64 100644 --- a/docs/source/reference/scancode-cli/cli-post-scan-options.rst +++ b/docs/source/reference/scancode-cli/cli-post-scan-options.rst @@ -17,6 +17,56 @@ To see all plugins available via command line help, use ``--plugins``. ---- +.. _cli-classify-option: + +``--classify`` +-------------- + + .. admonition:: Sub-option + + The options ``--license-clarity-score`` and ``--tallies-key-files`` are sub-options of + ``--classify``. ``--license-clarity-score`` and ``--tallies-key-files`` are Post-Scan + Options. + + **Example** + + .. 
code-block:: shell + + scancode -clpieu --json-pp sample_facet.json samples --classify + + This option makes ScanCode further classify scanned files/directories, to determine whether they + fall into the following categories: + + - legal + - readme + - top-level + - manifest + + A manifest file in computing is a file containing metadata for a group of accompanying + files that are part of a set or coherent unit. + + - key-file + + A KEY file serves as a keystone element, containing essential + information about a software package — such as its dependencies, + versioning, licensing, and more. It often contains the + ``primary-license`` or the overall license of the package, among + other package metadata that is general or ecosystem-specific. + + That is, these extra attributes are added to the JSON object of each scanned file: + + .. code-block:: json + + { + "is_legal": false, + "is_manifest": false, + "is_readme": true, + "is_top_level": true, + "is_key_file": true + } + +---- + .. _cli-mark-source-option: ``--mark-source`` diff --git a/docs/source/reference/scancode-cli/cli-pre-scan-options.rst b/docs/source/reference/scancode-cli/cli-pre-scan-options.rst index 45379bf94ac..ad19570b211 100644 --- a/docs/source/reference/scancode-cli/cli-pre-scan-options.rst +++ b/docs/source/reference/scancode-cli/cli-pre-scan-options.rst @@ -11,99 +11,6 @@ Quick reference ---- -.. _cli-ignore-option: - -``--ignore `` ----------------------- - - In a scan, all files inside the directory specified as an input argument is scanned. But if - there are some files which you don't want to scan, the ``--ignore`` option can be used to do - the same. - - **Example** - - .. code-block:: shell - - scancode --ignore "*.java" samples samples.json - - Here, ScanCode ignores files ending with `.java`, and continues with other files as usual. - - More information on :ref:`glob-pattern-matching`. - ----- - -..
_cli-include-option: - -``--include `` ------------------------ - - In a normal scan, all files inside the directory specified as an input argument is scanned. But - if you want to run the scan on only some selective files, then ``--include`` option can be used - to do the same. - - **Example** - - .. code-block:: shell - - scancode --include "*.java" samples samples.json - - Here, ScanCode selectively scans files that has names ending with `.java`, and ignores all other files. This - is basically complementary in behavior to the ``--ignore`` option. - - See also :ref:`glob-pattern-matching`. - ----- - -.. _cli-classify-option: - -``--classify`` --------------- - - .. admonition:: Sub-option - - The options ``--license-clarity-score`` and ``--tallies-key-files`` are sub-options of - ``--classify``. ``--license-clarity-score`` and ``--tallies-key-files`` are Post-Scan - Options. - - **Example** - - .. code-block:: shell - - scancode -clpieu --json-pp sample_facet.json samples --classify - - This option makes ScanCode further classify scanned files/directories, to determine whether they - fall in these following categories - - - legal - - readme - - top-level - - manifest - - A manifest file in computing is a file containing metadata for a group of accompanying - files that are part of a set or coherent unit. - - - key-file - - A KEY file serves as a keystone element, containing essential - information about a software package — such as its dependencies, - versioning, licensing, and more. It often contains the - ``primary-license`` or the overall license of the package, among - other package metadata which are general or ecosystem specific. - - As in, to the JSON object of each file scanned, these extra attributes are added. - - .. code-block:: json - - { - "is_legal": false, - "is_manifest": false, - "is_readme": true, - "is_top_level": true, - "is_key_file": true - } - ----- - .. 
_cli-facet-option: ``--facet =`` @@ -154,50 +61,3 @@ Quick reference multiple facets, this whole part is repeated, including the ``--facet`` option. See :ref:`facets` to learn more about what a facet is. - ----- - -.. _glob-pattern-matching: - -Glob Pattern Matching ---------------------- - - All the pre-scan options use pattern matching, so the basics of Glob Pattern Matching is - discussed briefly below. - - Glob pattern matching is useful for matching a group of files, by using patterns in their - names. Then using these patterns, files are grouped and treated differently as required. - - Here are some rules from the `Linux Manual `_ - on glob patterns. Refer the same for more detailed information. - - A string is a wildcard pattern if it contains one of the characters '?', '*' or '['. Globbing - is the operation that expands a wildcard pattern into the list of pathnames matching the - pattern. Matching is defined by: - - - A '?' (not between brackets) matches any single character. - - - A '*' (not between brackets) matches any string, including the empty string. - - - An expression "[...]" where the first character after the leading '[' is not an '!' matches a - single character, namely any of the characters enclosed by the brackets. - - - There is one special convention: two characters separated by '-' denote a range. - - - An expression "[!...]" matches a single character, namely any character that is not matched - by the expression obtained by removing the first '!' from it. - - - A '/' in a pathname cannot be matched by a '?' or '*' wildcard, or by a range like "[.-0]". - - Note that wildcard patterns are not regular expressions, although they are a bit similar. - - For more information on glob pattern matching refer these resources: - - - `Linux Manual `_ - - `Wildcard Match Documentation `_. 
- - You can also import these Python Libraries to practice UNIX style pattern matching: - - - `fnmatch `_ for File Name matching - - `glob `_ for File Path matching - diff --git a/docs/source/rst-snippets/cli-core-options.rst b/docs/source/rst-snippets/cli-core-options.rst index 8bf86b167af..0f589b2c6c0 100644 --- a/docs/source/rst-snippets/cli-core-options.rst +++ b/docs/source/rst-snippets/cli-core-options.rst @@ -1,37 +1,40 @@ **Core options** ---------------- --n, --processes INTEGER Scan ``<input>`` using n parallel processes. +--ignore <pattern> Ignore files matching ``<pattern>``. - Default: ``(number of CPUs)-1`` +-n, --processes INTEGER Scan ``<input>`` using n parallel processes. --v, --verbose Print verbose file-by-file progress messages. + Default: ``(number of CPUs)-1`` --q, --quiet Do not print summary or progress messages. +-c, --config-file FILENAME Path to the configuration file. +-v, --verbose Print verbose file-by-file progress messages. ---timeout FLOAT Stop scanning a file if scanning takes longer - than a timeout in seconds. +-q, --quiet Do not print summary or progress messages. - Default: ``120`` +--timeout FLOAT Stop scanning a file if scanning takes longer + than a timeout in seconds. ---from-json Load codebase from one or more existing JSON scans to: + Default: ``120`` - - apply post-scan options to do additional processing - of scan results - - merge multiple JSON scans into one. +--from-json Load codebase from one or more existing JSON scans to: ---max-in-memory INTEGER Maximum number of files and directories scan - details kept in memory during a scan. - Additional files and directories scan details - above this number are cached on-disk rather - than in memory. Use 0 to use unlimited memory - and disable on-disk caching. Use -1 to use - only on-disk caching. + - apply post-scan options to do additional processing + of scan results + - merge multiple JSON scans into one.
- Default: ``10000`` +--max-in-memory INTEGER Maximum number of files and directories scan + details kept in memory during a scan. + Additional files and directories scan details + above this number are cached on-disk rather + than in memory. Use 0 to use unlimited memory + and disable on-disk caching. Use -1 to use + only on-disk caching. ---max-depth INTEGER Descend at most INTEGER levels of directories - including and below the starting point. INTEGER - must be positive or zero for no limit. + Default: ``10000`` - Default: ``0`` +--max-depth INTEGER Descend at most INTEGER levels of directories + including and below the starting point. INTEGER + must be positive or zero for no limit. + + Default: ``0`` diff --git a/docs/source/rst-snippets/cli-pre-scan-options.rst b/docs/source/rst-snippets/cli-pre-scan-options.rst index b880e92abc3..f7361bb1b62 100644 --- a/docs/source/rst-snippets/cli-pre-scan-options.rst +++ b/docs/source/rst-snippets/cli-pre-scan-options.rst @@ -1,10 +1,6 @@ **Pre-scan options** -------------------- ---ignore Ignore files matching ````. - ---include Include files matching ````. - --facet Here ```` represents ``=``. Add the ```` to files with a path matching ````. 
diff --git a/tests/licensedcode/test_detect.py b/tests/licensedcode/test_detect.py index 5dbf7b369bf..7459553f2cc 100644 --- a/tests/licensedcode/test_detect.py +++ b/tests/licensedcode/test_detect.py @@ -1075,8 +1075,8 @@ def test_match_has_correct_line_positions_in_automake_perl_file(self): expected = [ # detected, match.lines(), match.qspan, ('gpl-2.0-plus', (12, 25), Span(51, 160)), - ('fsf-unlimited-no-warranty', (231, 238), Span(986, 1049)), - ('warranty-disclaimer', (306, 307), Span(1359, 1381)), + ('fsf-unlimited-no-warranty', (231, 238), Span(998, 1061) ), + ('warranty-disclaimer', (306, 307), Span(1371, 1393)), ] self.check_position('positions/automake.pl', expected) diff --git a/tests/scancode/test_ignore.py b/tests/scancode/test_plugin_ignore.py similarity index 100% rename from tests/scancode/test_ignore.py rename to tests/scancode/test_plugin_ignore.py From e901eabd5205bebee35a31b6d938a6091e4578d6 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Tue, 23 Jun 2026 18:12:48 +0530 Subject: [PATCH 10/10] Fix multiple path bug in windows Signed-off-by: Ayan Sinha Mahapatra --- src/commoncode/resource.py | 19 +- src/commoncode/system.py | 9 + src/commoncode/testcase.py | 11 +- .../summaries/multiple-input-expected.json | 1278 +++++++++++++++++ tests/scancode/test_cli.py | 6 +- 5 files changed, 1306 insertions(+), 17 deletions(-) create mode 100644 tests/scancode/data/summaries/multiple-input-expected.json diff --git a/src/commoncode/resource.py b/src/commoncode/resource.py index 5302f2f9d72..70e549cc924 100644 --- a/src/commoncode/resource.py +++ b/src/commoncode/resource.py @@ -49,6 +49,7 @@ from commoncode.fileutils import file_name from commoncode.fileutils import parent_directory from commoncode.fileutils import splitext_name +from commoncode.system import to_os_native_path """ This module provides Codebase and Resource objects as an abstraction for files @@ -64,7 +65,7 @@ # Tracing flags TRACE = False -TRACE_DEEP = False +TRACE_DEEP = True def 
logger_debug(*args): @@ -586,7 +587,7 @@ def _create_resources_from_root(self, root, includes, ignores): # track resources parents by location during construction. # NOTE: this cannot exhaust memory on a large codebase, because we do # not keep parents already walked and we walk topdown. - parents_by_loc = {root.location: root} + parents_by_loc = {to_os_native_path(root.location): root} def err(_error): """os.walk error handler""" @@ -613,7 +614,10 @@ def err(_error): includes=includes, ): if not created.is_file: - parents_by_loc[created.location] = created + parents_by_loc[to_os_native_path(created.location)] = created + + if TRACE_DEEP: + logger_debug(f"parents_by_loc: {parents_by_loc}") # we start walking through all the input locations for included_location in includes: @@ -624,7 +628,12 @@ def err(_error): max_depth=self.max_depth, error_handler=err, ): - parent = parents_by_loc.pop(top) + if TRACE_DEEP: + logger_debug(f"parents_by_loc: {parents_by_loc}") + try: + parent = parents_by_loc.pop(top) + except KeyError: + raise Exception(parents_by_loc, includes, root.location, ) for created in self._create_resources( parent=parent, top=top, @@ -634,7 +643,7 @@ def err(_error): ): # on the plain, bare FS, files cannot be parents if not created.is_file: - parents_by_loc[created.location] = created + parents_by_loc[to_os_native_path(created.location)] = created def _create_resources(self, parent, top, dirs, files, skip_ignored=skip_ignored): """ diff --git a/src/commoncode/system.py b/src/commoncode/system.py index 0e82a70417f..6bbc8535aef 100644 --- a/src/commoncode/system.py +++ b/src/commoncode/system.py @@ -13,6 +13,15 @@ from commoncode.distro import parse_os_release +def to_os_native_path(path): + """ + Normalize a path to use the native OS path separator. 
+ """ + OS_PATH_SEP = "\\" if on_windows else "/" + + return path.replace("/", OS_PATH_SEP).replace("\\", OS_PATH_SEP).rstrip(OS_PATH_SEP) + + def os_arch(): """ Return a tuple for the current the OS and architecture. diff --git a/src/commoncode/testcase.py b/src/commoncode/testcase.py index ee680e24da2..9db033ceb80 100644 --- a/src/commoncode/testcase.py +++ b/src/commoncode/testcase.py @@ -29,7 +29,7 @@ from commoncode.archive import extract_zip_raw from commoncode.archive import tar_can_extract # NOQA from commoncode.system import on_posix -from commoncode.system import on_windows +from commoncode.system import to_os_native_path # a base test dir specific to a given test run # to ensure that multiple tests run can be launched in parallel @@ -39,15 +39,6 @@ timing_threshold = sys.maxsize -def to_os_native_path(path): - """ - Normalize a path to use the native OS path separator. - """ - OS_PATH_SEP = "\\" if on_windows else "/" - - return path.replace("/", OS_PATH_SEP).replace("\\", OS_PATH_SEP).rstrip(OS_PATH_SEP) - - def get_test_loc( test_path, test_data_dir, diff --git a/tests/scancode/data/summaries/multiple-input-expected.json b/tests/scancode/data/summaries/multiple-input-expected.json new file mode 100644 index 00000000000..0c0fe6169fb --- /dev/null +++ b/tests/scancode/data/summaries/multiple-input-expected.json @@ -0,0 +1,1278 @@ +{ + "files": [ + { + "path": "summaries", + "type": "directory", + "name": "summaries", + "base_name": "summaries", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "sha1_git": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 35, + "dirs_count": 13, + "size_count": 1161085, + "scan_errors": [] + }, + { + "path": "summaries/client", + "type": "directory", + "name": "client", + "base_name": "client", + 
"extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "sha1_git": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 2, + "dirs_count": 1, + "size_count": 2, + "scan_errors": [] + }, + { + "path": "summaries/client/Images", + "type": "directory", + "name": "Images", + "base_name": "Images", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "sha1_git": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 2, + "dirs_count": 0, + "size_count": 2, + "scan_errors": [] + }, + { + "path": "summaries/client/Images/applicationCache.png", + "type": "file", + "name": "applicationCache.png", + "base_name": "applicationCache", + "extension": ".png", + "size": 1, + "date": "2026-05-22", + "sha1": "adc83b19e793491b1c6ea0fd8b46cd9f32e592fc", + "md5": "68b329da9893e34099c7d8ad5cb9c940", + "sha256": "01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b", + "sha1_git": "8b137891791fe96927ad78e64b0aad7bded08bdc", + "mime_type": "application/octet-stream", + "file_type": "very short file (no magic)", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/client/Images/spinner.gif", + "type": "file", + "name": "spinner.gif", + "base_name": "spinner", + "extension": ".gif", + "size": 1, + "date": "2026-05-22", + "sha1": "adc83b19e793491b1c6ea0fd8b46cd9f32e592fc", + "md5": "68b329da9893e34099c7d8ad5cb9c940", + "sha256": 
"01ba4719c80b6fe911b091a7c05124b64eeece964e09c058ef8f9805daca546b", + "sha1_git": "8b137891791fe96927ad78e64b0aad7bded08bdc", + "mime_type": "application/octet-stream", + "file_type": "very short file (no magic)", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts", + "type": "directory", + "name": "counts", + "base_name": "counts", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "sha1_git": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 33, + "dirs_count": 10, + "size_count": 1161083, + "scan_errors": [] + }, + { + "path": "summaries/counts/JGroups", + "type": "directory", + "name": "JGroups", + "base_name": "JGroups", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "sha1_git": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 14, + "dirs_count": 2, + "size_count": 241228, + "scan_errors": [] + }, + { + "path": "summaries/counts/JGroups/EULA", + "type": "file", + "name": "EULA", + "base_name": "EULA", + "extension": "", + "size": 8156, + "date": "2026-05-22", + "sha1": "eb232aa0424eca9c4136904e6143b72aaa9cf4de", + "md5": "0be0aceb8296727efff0ac0bf8e6bdb3", + "sha256": "6ef829995515206ba682183a68f971f00ee91b6bd1b4427f76a6bf364969c1ae", + "sha1_git": "0dcb788ede5b2c0b1659c5c2f2bb0cb40e245fe1", + "mime_type": "text/plain", + "file_type": "ASCII text", + "programming_language": "verilog", + 
"is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/JGroups/LICENSE", + "type": "file", + "name": "LICENSE", + "base_name": "LICENSE", + "extension": "", + "size": 26430, + "date": "2026-05-22", + "sha1": "e60c2e780886f95df9c9ee36992b8edabec00bcc", + "md5": "7fbc338309ac38fefcd64b04bb903e34", + "sha256": "a190dc9c8043755d90f8b0a75fa66b9e42d4af4c980bf5ddc633f0124db3cee7", + "sha1_git": "b1e3f5a2638797271cbc9b91b856c05ed6942c8f", + "mime_type": "text/plain", + "file_type": "ASCII text", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/JGroups/licenses", + "type": "directory", + "name": "licenses", + "base_name": "licenses", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "sha1_git": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 5, + "dirs_count": 0, + "size_count": 54552, + "scan_errors": [] + }, + { + "path": "summaries/counts/JGroups/licenses/apache-1.1.txt", + "type": "file", + "name": "apache-1.1.txt", + "base_name": "apache-1.1", + "extension": ".txt", + "size": 2885, + "date": "2026-05-22", + "sha1": "6b5608d35c3e304532af43db8bbfc5947bef46a6", + "md5": "276982197c941f4cbf3d218546e17ae2", + "sha256": "b03079c80bc3657f4b9d838f02f036e4611693a0e42b043d5d71b45ac6c5040d", + "sha1_git": "dae2270c2c0118eef91e8a6c841299983b71e771", + "mime_type": "text/plain", + "file_type": "ASCII text, with CRLF line terminators", + 
"programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/JGroups/licenses/apache-2.0.txt", + "type": "file", + "name": "apache-2.0.txt", + "base_name": "apache-2.0", + "extension": ".txt", + "size": 11560, + "date": "2026-05-22", + "sha1": "47b573e3824cd5e02a1a3ae99e2735b49e0256e4", + "md5": "d273d63619c9aeaf15cdaf76422c4f87", + "sha256": "3ddf9be5c28fe27dad143a5dc76eea25222ad1dd68934a047064e56ed2fa40c5", + "sha1_git": "75b52484ea471f882c29e02693b4f02dba175b5e", + "mime_type": "text/plain", + "file_type": "ASCII text, with CRLF line terminators", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/JGroups/licenses/bouncycastle.txt", + "type": "file", + "name": "bouncycastle.txt", + "base_name": "bouncycastle", + "extension": ".txt", + "size": 1186, + "date": "2026-05-22", + "sha1": "74facb0e9a734479f9cd893b5be3fe1bf651b760", + "md5": "9fffd8de865a5705969f62b128381f85", + "sha256": "3d469c451a2a0e97380b90143d979281fadd39be55432b903e6bd18b1b9915d4", + "sha1_git": "3cf73c2f03238a23b56389c301deece6ab625b20", + "mime_type": "text/plain", + "file_type": "ASCII text", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/JGroups/licenses/cpl-1.0.txt", + "type": "file", + "name": "cpl-1.0.txt", + "base_name": "cpl-1.0", + "extension": ".txt", + "size": 11987, + "date": "2026-05-22", + "sha1": 
"681cf776bcd79752543d42490ec7ed22a29fd888", + "md5": "9a6d2c9ae73d59eb3dd38e3909750d14", + "sha256": "d9a768a23056b25ab4b0b48381003ce55f0d32514da5a4e017fa0765b3a887aa", + "sha1_git": "2243be15b296d7f00716bfb6e909d7325dbca0a8", + "mime_type": "text/plain", + "file_type": "ASCII text, with CRLF line terminators", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/JGroups/licenses/lgpl.txt", + "type": "file", + "name": "lgpl.txt", + "base_name": "lgpl", + "extension": ".txt", + "size": 26934, + "date": "2026-05-22", + "sha1": "8f1a637d2e2ed1bdb9eb01a7dccb5c12cc0557e1", + "md5": "f14599a2f089f6ff8c97e2baa4e3d575", + "sha256": "885a03f54b157961236f46843e79972abfcd6890b6cbb368bc7eca328ff95a12", + "sha1_git": "cbee875ba6ddb0dadab286daf7ccec2f6f64191f", + "mime_type": "text/plain", + "file_type": "ASCII text, with CRLF line terminators", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/JGroups/src", + "type": "directory", + "name": "src", + "base_name": "src", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "sha1_git": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 7, + "dirs_count": 0, + "size_count": 152090, + "scan_errors": [] + }, + { + "path": "summaries/counts/JGroups/src/FixedMembershipToken.java", + "type": "file", + "name": "FixedMembershipToken.java", + "base_name": "FixedMembershipToken", + 
"extension": ".java", + "size": 5144, + "date": "2026-05-22", + "sha1": "5901f73dcc78155a1a2c7b5663a3a11fba400b19", + "md5": "aca9640ec8beee21b098bcf8ecc91442", + "sha256": "aac525060867f5004c7343690f1c197c9a678b334d402e0e9fd117c8b2df73f2", + "sha1_git": "46cf578d6de505d076c7ed49cc791f6597b6f4a9", + "mime_type": "text/x-java", + "file_type": "Java source, ASCII text", + "programming_language": "Java", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/JGroups/src/GuardedBy.java", + "type": "file", + "name": "GuardedBy.java", + "base_name": "GuardedBy", + "extension": ".java", + "size": 813, + "date": "2026-05-22", + "sha1": "981d67087e65e9a44957c026d4b10817cf77d966", + "md5": "c5064400f759d3e81771005051d17dc1", + "sha256": "7c3e384429f27692534184e1511f70416c04c3f0b30be632710101840996695a", + "sha1_git": "6d9a9ec4a3f12a5619dd42cd560f36fd271fea43", + "mime_type": "text/x-java", + "file_type": "Java source, ASCII text", + "programming_language": "Java", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/JGroups/src/ImmutableReference.java", + "type": "file", + "name": "ImmutableReference.java", + "base_name": "ImmutableReference", + "extension": ".java", + "size": 1838, + "date": "2026-05-22", + "sha1": "30f56b876d5576d9869e2c5c509b08db57110592", + "md5": "48ca3c72fb9a65c771a321222f118b88", + "sha256": "8a3fb390d4932a92c56e7b999b63b8e5ab55cbe81f65b27439296f279d160bd1", + "sha1_git": "50c720e0bf04f3b06fc8ef4bf7d176c41d6839bc", + "mime_type": "text/plain", + "file_type": "ASCII text", + "programming_language": "Java", + "is_binary": false, + "is_text": true, + "is_archive": false, + 
"is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/JGroups/src/RATE_LIMITER.java", + "type": "file", + "name": "RATE_LIMITER.java", + "base_name": "RATE_LIMITER", + "extension": ".java", + "size": 3692, + "date": "2026-05-22", + "sha1": "a8087e5d50da3273536ebda9b87b77aa4ff55deb", + "md5": "4626bdbc48871b55513e1a12991c61a8", + "sha256": "80709043c6c1f4fbd6e7a43c9381da034ab9b67e2e6fee80973a0d4fd33664e0", + "sha1_git": "d0765aa5f296c5f9711b279014331f62ea6f43f4", + "mime_type": "text/x-java", + "file_type": "Java source, ASCII text", + "programming_language": "Java", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/JGroups/src/RouterStub.java", + "type": "file", + "name": "RouterStub.java", + "base_name": "RouterStub", + "extension": ".java", + "size": 9913, + "date": "2026-05-22", + "sha1": "c1f6818f8ee7bddcc9f444bc94c099729d716d52", + "md5": "eecfe23494acbcd8088c93bc1e83c7f2", + "sha256": "f212de138e8cb0b7eb13521d8ed2620bc41af55093b857da753d7753b1d3438d", + "sha1_git": "1e0b9f9ef4c063cb7e62e9ddd9abf6a596ef7faa", + "mime_type": "text/x-java", + "file_type": "Java source, ASCII text", + "programming_language": "Java", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/JGroups/src/RouterStubManager.java", + "type": "file", + "name": "RouterStubManager.java", + "base_name": "RouterStubManager", + "extension": ".java", + "size": 8162, + "date": "2026-05-22", + "sha1": "eb419dc94cfe11ca318a3e743a7f9f080e70c751", + "md5": "20bee9631b7c82a45c250e095352aec7", + "sha256": 
"c39a40d4057256a8fe70f2b69e5f940edcaf8b377b546d537e799ecff3f58b81", + "sha1_git": "47153252434d56c35406e63207e4a6a393fa508f", + "mime_type": "text/x-java", + "file_type": "Java source, ASCII text", + "programming_language": "Java", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/JGroups/src/S3_PING.java", + "type": "file", + "name": "S3_PING.java", + "base_name": "S3_PING", + "extension": ".java", + "size": 122528, + "date": "2026-05-22", + "sha1": "08dba9986f69719970ead3592dc565465164df0d", + "md5": "83d8324f37d0e3f120bc89865cf0bd39", + "sha256": "c4d59a8837c6320788c74496201e3ecc0ff2100525ebb727bcae6d855b34c548", + "sha1_git": "2f93ec6cc9cb3cf384268b2bce073a9c4fc152f5", + "mime_type": "text/x-java", + "file_type": "Java source, ASCII text", + "programming_language": "Java", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/README", + "type": "file", + "name": "README", + "base_name": "README", + "extension": "", + "size": 236, + "date": "2026-05-22", + "sha1": "2e07e32c52d607204fad196052d70e3d18fb8636", + "md5": "effc6856ef85a9250fb1a470792b3f38", + "sha256": "165da86bfdf296cd5a0a3e20c1d1ee86d70ecb8a1fa579d6f8cadad8eee85878", + "sha1_git": "1d61df81ffb14fd19f1ac10344a51755e8719282", + "mime_type": "text/plain", + "file_type": "ASCII text", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/arch", + "type": "directory", + "name": "arch", + "base_name": "arch", + 
"extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "sha1_git": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 1, + "dirs_count": 0, + "size_count": 28103, + "scan_errors": [] + }, + { + "path": "summaries/counts/arch/zlib.tar.gz", + "type": "file", + "name": "zlib.tar.gz", + "base_name": "zlib", + "extension": ".tar.gz", + "size": 28103, + "date": "2026-05-22", + "sha1": "576f0ccfe534d7f5ff5d6400078d3c6586de3abd", + "md5": "20b2370751abfc08bb3556c1d8114b5a", + "sha256": "e6bb199f3b59fffac4092542a516a46b7f922e607d754c21ef5b27334b1f3ba6", + "sha1_git": "b57920bb555f6881693d57da741cd1cce9cf2847", + "mime_type": "application/gzip", + "file_type": "gzip compressed data, last modified: Wed Jul 15 09:08:19 2015, from Unix, original size modulo 2^32 103424", + "programming_language": null, + "is_binary": true, + "is_text": false, + "is_archive": true, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/screenshot.png", + "type": "file", + "name": "screenshot.png", + "base_name": "screenshot", + "extension": ".png", + "size": 622754, + "date": "2026-05-22", + "sha1": "01ff4b1de0bc6c75c9cca6e46c80c1802d6976d4", + "md5": "b6ef5a90777147423c98b42a6a25e57a", + "sha256": "a1c9905b77a8ff7e72c93abc85d32d9e43353996710b83c5bfa581c5f2af60ad", + "sha1_git": "97155e4a9b903a58abf29d62925d8db01c748a2e", + "mime_type": "image/png", + "file_type": "PNG image data, 2880 x 1666, 8-bit/color RGB, non-interlaced", + "programming_language": null, + "is_binary": true, + "is_text": false, + "is_archive": false, + "is_media": true, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": 
[] + }, + { + "path": "summaries/counts/zlib", + "type": "directory", + "name": "zlib", + "base_name": "zlib", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "sha1_git": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 16, + "dirs_count": 5, + "size_count": 268762, + "scan_errors": [] + }, + { + "path": "summaries/counts/zlib/ada", + "type": "directory", + "name": "ada", + "base_name": "ada", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "sha1_git": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 1, + "dirs_count": 0, + "size_count": 13594, + "scan_errors": [] + }, + { + "path": "summaries/counts/zlib/ada/zlib.ads", + "type": "file", + "name": "zlib.ads", + "base_name": "zlib", + "extension": ".ads", + "size": 13594, + "date": "2026-05-22", + "sha1": "0245a91806d804bf9f0907a3a001a141e9adb61b", + "md5": "71de2670f2e588b51c62e7f6a9046399", + "sha256": "02634bec0d5e4c69d8d2859124380074a57de8d8bd928398379bfacc514236d2", + "sha1_git": "79ffc4095cf46f90a30334466637b4df61dfaa5b", + "mime_type": "text/plain", + "file_type": "ASCII text", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/zlib/adler32.c", + "type": "file", + "name": "adler32.c", + "base_name": "adler32", + "extension": ".c", + "size": 4968, + "date": "2026-05-22", + "sha1": "0cff4808476ce0b5f6f0ebbc69ee2ab2a0eebe43", + "md5": 
"ae3bbb54820e1d49fb90cbba222e973f", + "sha256": "341d49ae2703037d2d10c8486f1a1ca3b65e0f10cc9e5fead6bfbbc0b34564ba", + "sha1_git": "a868f073d8a0e35dcb3ec812b41b1d3f0acdd84d", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/zlib/deflate.c", + "type": "file", + "name": "deflate.c", + "base_name": "deflate", + "extension": ".c", + "size": 71476, + "date": "2026-05-22", + "sha1": "7b4ace6d698c5dbbfb9a8f047f63228ca54d2e77", + "md5": "cd7826278ce9d9d9ed5abdefef50c3e2", + "sha256": "565e68ddfff5af8efd55f71e122b860ad11527a7d9de40a76af2b16afef24cc0", + "sha1_git": "696957705b756b1457a18c7a23a91affafa17d91", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/zlib/deflate.h", + "type": "file", + "name": "deflate.h", + "base_name": "deflate", + "extension": ".h", + "size": 12774, + "date": "2026-05-22", + "sha1": "29ed3b8ca3927576e5889dea5880ca0052942c7d", + "md5": "7ceae74a13201f14c91623116af169c3", + "sha256": "80570c8052491bdc7583600da28a8f1cb32c27ab1cec107ec12c83255d426cf7", + "sha1_git": "ce0299edd19168b97e38667479bd1b5e769a63d0", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/zlib/dotzlib", + "type": "directory", + 
"name": "dotzlib", + "base_name": "dotzlib", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "sha1_git": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 4, + "dirs_count": 0, + "size_count": 14257, + "scan_errors": [] + }, + { + "path": "summaries/counts/zlib/dotzlib/AssemblyInfo.cs", + "type": "file", + "name": "AssemblyInfo.cs", + "base_name": "AssemblyInfo", + "extension": ".cs", + "size": 2500, + "date": "2026-05-22", + "sha1": "9f1db1177b2e9a014f72bb3cd80be17133e06d16", + "md5": "23d0d7c18846fc31655b6aa89b7c8038", + "sha256": "314afcfb339ea95f5431047b7ab24631b11c3532c7ce5dc2094ed0cf80a7c16d", + "sha1_git": "0491bfc2b036f179f9d3a2f37fd61d9b3b8dd779", + "mime_type": "text/plain", + "file_type": "ASCII text, with CRLF line terminators", + "programming_language": "C#", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/zlib/dotzlib/ChecksumImpl.cs", + "type": "file", + "name": "ChecksumImpl.cs", + "base_name": "ChecksumImpl", + "extension": ".cs", + "size": 8040, + "date": "2026-05-22", + "sha1": "3807a0e24a57b92ea301559cab7307b8eab52c51", + "md5": "d01b3cb2e75da9b15f05b92b42f6bd33", + "sha256": "e7c047a2c3bcf88d3d002ee3d2d05af414acf53cb4451efacc0f2e95a474ea0f", + "sha1_git": "788b2fcecedb07801588b0e7f6be89b66e4e1e72", + "mime_type": "text/x-c++", + "file_type": "C++ source, ISO-8859 text, with CRLF line terminators", + "programming_language": "C#", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": 
[] + }, + { + "path": "summaries/counts/zlib/dotzlib/LICENSE_1_0.txt", + "type": "file", + "name": "LICENSE_1_0.txt", + "base_name": "LICENSE_1_0", + "extension": ".txt", + "size": 1359, + "date": "2026-05-22", + "sha1": "892b34f7865d90a6f949f50d95e49625a10bc7f0", + "md5": "81543b22c36f10d20ac9712f8d80ef8d", + "sha256": "36266a8fd073568394cb81cdb2b124f7fdae2c64c1a7ed09db34b4d22efa2951", + "sha1_git": "30aac2cf4793f3aad92ef0a3c88731198c39566e", + "mime_type": "text/plain", + "file_type": "ASCII text, with CRLF line terminators", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/zlib/dotzlib/readme.txt", + "type": "file", + "name": "readme.txt", + "base_name": "readme", + "extension": ".txt", + "size": 2358, + "date": "2026-05-22", + "sha1": "b1229b826f0096808628474538cea8fec2922a9b", + "md5": "1f20f3168ee63d90de033edac2ce383c", + "sha256": "d04972a91b1563fb4b7acab4b9ff2b84e57368953cc0596d5f5ea17d97315fd0", + "sha1_git": "b2395720d4c5693213001c449ed09869be9bd944", + "mime_type": "text/plain", + "file_type": "ASCII text, with CRLF line terminators", + "programming_language": null, + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/zlib/gcc_gvmat64", + "type": "directory", + "name": "gcc_gvmat64", + "base_name": "gcc_gvmat64", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "sha1_git": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 
1, + "dirs_count": 0, + "size_count": 16413, + "scan_errors": [] + }, + { + "path": "summaries/counts/zlib/gcc_gvmat64/gvmat64.S", + "type": "file", + "name": "gvmat64.S", + "base_name": "gvmat64", + "extension": ".S", + "size": 16413, + "date": "2026-05-22", + "sha1": "742603cba1af98a1432cc02efb019b1a5760adf2", + "md5": "5e772d7302475e5473d0c4c57b9861e8", + "sha256": "22ff411b8b1d1b04aeaa8418b68245400267dc43c6f44104f6ccd37f0daee89f", + "sha1_git": "dd858ddbd16b031aa8aed0794ab120a647b97818", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text, with CRLF line terminators", + "programming_language": "GAS", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/zlib/infback9", + "type": "directory", + "name": "infback9", + "base_name": "infback9", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "sha1_git": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 2, + "dirs_count": 0, + "size_count": 23223, + "scan_errors": [] + }, + { + "path": "summaries/counts/zlib/infback9/infback9.c", + "type": "file", + "name": "infback9.c", + "base_name": "infback9", + "extension": ".c", + "size": 21629, + "date": "2026-05-22", + "sha1": "17fb362c03755b12f2dda5b12a68cf38162674bd", + "md5": "23ff5edec0817da303cb1294c1e4205c", + "sha256": "0a715c85a1ce3bb8b5a18d60941ffabc0186a886bcc66ba2ee0c4115a8e274e9", + "sha1_git": "05fb3e338070d67054858cd2fe469e3bbb2044a3", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + 
"is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/zlib/infback9/infback9.h", + "type": "file", + "name": "infback9.h", + "base_name": "infback9", + "extension": ".h", + "size": 1594, + "date": "2026-05-22", + "sha1": "d0486a32b558dcaceded5f0746fad62e680a4734", + "md5": "52b1ed99960d3ed7ed60cd20295e64a8", + "sha256": "dda2302f28157fe43a6143f84802af1740393572c2766559593996fd7a5a3245", + "sha1_git": "1073c0a38e6c2c7f51d7638135a08f1471d7320c", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/zlib/iostream2", + "type": "directory", + "name": "iostream2", + "base_name": "iostream2", + "extension": "", + "size": 0, + "date": null, + "sha1": null, + "md5": null, + "sha256": null, + "sha1_git": null, + "mime_type": null, + "file_type": null, + "programming_language": null, + "is_binary": false, + "is_text": false, + "is_archive": false, + "is_media": false, + "is_source": false, + "is_script": false, + "files_count": 2, + "dirs_count": 0, + "size_count": 9994, + "scan_errors": [] + }, + { + "path": "summaries/counts/zlib/iostream2/zstream.h", + "type": "file", + "name": "zstream.h", + "base_name": "zstream", + "extension": ".h", + "size": 9283, + "date": "2026-05-22", + "sha1": "fca4540d490fff36bb90fd801cf9cd8fc695bb17", + "md5": "a980b61c1e8be68d5cdb1236ba6b43e7", + "sha256": "d0343e0c57ff58008b6f29643d289c72713aa2d653fe3dcd2e939fc77e7e20b6", + "sha1_git": "43d2332b79b70bb8ead6d84838e6841e349ec818", + "mime_type": "text/x-c++", + "file_type": "C++ source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + 
"is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/zlib/iostream2/zstream_test.cpp", + "type": "file", + "name": "zstream_test.cpp", + "base_name": "zstream_test", + "extension": ".cpp", + "size": 711, + "date": "2026-05-22", + "sha1": "e18a6d55cbbd8b832f8d795530553467e5c74fcf", + "md5": "d32476bde4e6d5f889092fdff6f8cdb0", + "sha256": "f789df183cc58b78751985466380c656308490a9036eb48a7ef79704c3d3f229", + "sha1_git": "6273f62d62a8fa280edcfb798a013e0a0ae84534", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C++", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/zlib/zlib.h", + "type": "file", + "name": "zlib.h", + "base_name": "zlib", + "extension": ".h", + "size": 87883, + "date": "2026-05-22", + "sha1": "400d35465f179a4acacb5fe749e6ce20a0bbdb84", + "md5": "64d8a5180bd54ff5452886e4cbb21e14", + "sha256": "726b0569915917b967f87f3f08a1eec039101bf9dcc29d61c0b2b0b8f271b58d", + "sha1_git": "3e0c7672ac51d93782f020bba32eb1207617e70a", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/zlib/zutil.c", + "type": "file", + "name": "zutil.c", + "base_name": "zutil", + "extension": ".c", + "size": 7414, + "date": "2026-05-22", + "sha1": "e1af709bff21ae0d4331119a7fc4c19f82932043", + "md5": "fff257bc1656eb60fc585a7dc35f963d", + "sha256": "c5e9927d5a1a1dec514ccdcedfa1e0f01664c58bb33166b4997b50b8001f1d6c", + "sha1_git": "23d2ebef008fdcc00833eba0d9abcd7b9c665531", + "mime_type": 
"text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + }, + { + "path": "summaries/counts/zlib/zutil.h", + "type": "file", + "name": "zutil.h", + "base_name": "zutil", + "extension": ".h", + "size": 6766, + "date": "2026-05-22", + "sha1": "b909d27ef9ce51639f76b7ea6b62721e7d1b6bf7", + "md5": "04fcfbb961591c9452c4d0fd1525ffdf", + "sha256": "91cce8e78e83bcdb8c6acb98d4f0686dbdc81ca97d4a36a60c0b48f7ef78f1af", + "sha1_git": "24ab06b1cf60aeba4ade9ab36ff7ad5f73541960", + "mime_type": "text/x-c", + "file_type": "C source, ASCII text", + "programming_language": "C", + "is_binary": false, + "is_text": true, + "is_archive": false, + "is_media": false, + "is_source": true, + "is_script": false, + "files_count": 0, + "dirs_count": 0, + "size_count": 0, + "scan_errors": [] + } + ] +} \ No newline at end of file diff --git a/tests/scancode/test_cli.py b/tests/scancode/test_cli.py index 0dca907efcf..6e576186ded 100644 --- a/tests/scancode/test_cli.py +++ b/tests/scancode/test_cli.py @@ -838,11 +838,13 @@ def test_scan_should_not_fail_with_low_max_in_memory_setting_when_ignoring_files def test_scan_supports_multiple_input_paths(): - test_file_1 = test_env.get_test_loc('summaries/client', relative=True) - test_file_2 = test_env.get_test_loc('summaries/counts', relative=True) + test_file_1 = test_env.get_test_loc('summaries/client', relative=True).strip("\\") + test_file_2 = test_env.get_test_loc('summaries/counts', relative=True).strip("\\") result_file = test_env.get_temp_file('json') args = ['--info', '-n', '1', test_file_1, test_file_2, '--json', result_file] run_scan_click(args, expected_rc=0) + expected = test_env.get_test_loc('summaries/multiple-input-expected.json') + check_json_scan(expected_file=expected, result_file=result_file, 
regen=REGEN_TEST_FIXTURES, remove_file_date=True)