From ef035f38dcf20aac0f969607befccddf55efceb7 Mon Sep 17 00:00:00 2001 From: Jose Javier Merchante Date: Wed, 12 Mar 2025 12:51:38 +0100 Subject: [PATCH 1/2] [metrics] Add binary file type detection This commit adds binary file type detection, identifying some popular binary executables and compressed binary files. Signed-off-by: Jose Javier Merchante --- tests/data/events.json | 14 ++++++++++++++ tests/end_to_end/test_cli.py | 12 ++++++++++-- tests/unit/test_metrics.py | 1 + trustable_cli/metrics.py | 11 +++++++++-- 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/tests/data/events.json b/tests/data/events.json index d2140d2..5e35caa 100644 --- a/tests/data/events.json +++ b/tests/data/events.json @@ -675,6 +675,20 @@ "added": "85", "removed": "0" }, + { + "modes": [ + "000000", + "100644" + ], + "indexes": [ + "0000000", + "06ee9fa" + ], + "action": "A", + "file": "sample.tar.gz", + "added": "0", + "removed": "0" + }, { "modes": [ "000000", diff --git a/tests/end_to_end/test_cli.py b/tests/end_to_end/test_cli.py index dde3388..2f9c7d5 100644 --- a/tests/end_to_end/test_cli.py +++ b/tests/end_to_end/test_cli.py @@ -72,6 +72,7 @@ def test_metrics(self): self.assertEqual(quickstart_metrics["pony_factor"], 2) self.assertEqual(quickstart_metrics["elephant_factor"], 2) self.assertEqual(quickstart_metrics["file_types_other"], 683) + self.assertNotIn("file_types_binary", quickstart_metrics) self.assertEqual(quickstart_metrics["file_types_code"], 479) self.assertEqual(quickstart_metrics["commit_size_added_lines"], 53121) self.assertEqual(quickstart_metrics["commit_size_removed_lines"], 51852) @@ -92,7 +93,8 @@ def test_metrics(self): self.assertEqual(angular_metrics["total_contributors"], 58) self.assertEqual(angular_metrics["pony_factor"], 5) self.assertEqual(angular_metrics["elephant_factor"], 2) - self.assertEqual(angular_metrics["file_types_other"], 538) + self.assertEqual(angular_metrics["file_types_other"], 534) + 
self.assertEqual(angular_metrics["file_types_binary"], 4) self.assertEqual(angular_metrics["file_types_code"], 2129) self.assertEqual(angular_metrics["commit_size_added_lines"], 218483) self.assertEqual(angular_metrics["commit_size_removed_lines"], 245784) @@ -147,6 +149,7 @@ def test_from_date(self): self.assertEqual(quickstart_metrics["pony_factor"], 2) self.assertEqual(quickstart_metrics["elephant_factor"], 1) self.assertEqual(quickstart_metrics["file_types_other"], 37) + self.assertNotIn("file_types_binary", quickstart_metrics) self.assertEqual(quickstart_metrics["file_types_code"], 17) self.assertEqual(quickstart_metrics["commit_size_added_lines"], 269) self.assertEqual(quickstart_metrics["commit_size_removed_lines"], 103) @@ -168,6 +171,7 @@ def test_from_date(self): self.assertEqual(angular_metrics["pony_factor"], 1) self.assertEqual(angular_metrics["elephant_factor"], 1) self.assertEqual(angular_metrics["file_types_other"], 24) + self.assertNotIn("file_types_binary", angular_metrics) self.assertEqual(angular_metrics["file_types_code"], 13) self.assertEqual(angular_metrics["commit_size_added_lines"], 4849) self.assertEqual(angular_metrics["commit_size_removed_lines"], 149) @@ -223,6 +227,7 @@ def test_to_date(self): self.assertEqual(quickstart_metrics["pony_factor"], 2) self.assertEqual(quickstart_metrics["elephant_factor"], 2) self.assertEqual(quickstart_metrics["file_types_other"], 646) + self.assertNotIn("file_types_binary", quickstart_metrics) self.assertEqual(quickstart_metrics["file_types_code"], 462) self.assertEqual(quickstart_metrics["commit_size_added_lines"], 52852) self.assertEqual(quickstart_metrics["commit_size_removed_lines"], 51749) @@ -243,7 +248,8 @@ def test_to_date(self): self.assertEqual(angular_metrics["total_contributors"], 56) self.assertEqual(angular_metrics["pony_factor"], 5) self.assertEqual(angular_metrics["elephant_factor"], 2) - self.assertEqual(angular_metrics["file_types_other"], 514) + 
self.assertEqual(angular_metrics["file_types_other"], 510) + self.assertEqual(angular_metrics["file_types_binary"], 4) self.assertEqual(angular_metrics["file_types_code"], 2116) self.assertEqual(angular_metrics["commit_size_added_lines"], 213634) self.assertEqual(angular_metrics["commit_size_removed_lines"], 245635) @@ -299,6 +305,7 @@ def test_duplicate_repo(self): self.assertEqual(quickstart_metrics["pony_factor"], 2) self.assertEqual(quickstart_metrics["elephant_factor"], 2) self.assertEqual(quickstart_metrics["file_types_other"], 683) + self.assertNotIn("file_types_binary", quickstart_metrics) self.assertEqual(quickstart_metrics["file_types_code"], 479) self.assertEqual(quickstart_metrics["commit_size_added_lines"], 53121) self.assertEqual(quickstart_metrics["commit_size_removed_lines"], 51852) @@ -354,6 +361,7 @@ def test_non_git_repo(self): self.assertEqual(quickstart_metrics["pony_factor"], 2) self.assertEqual(quickstart_metrics["elephant_factor"], 2) self.assertEqual(quickstart_metrics["file_types_other"], 683) + self.assertNotIn("file_types_binary", quickstart_metrics) self.assertEqual(quickstart_metrics["file_types_code"], 479) self.assertEqual(quickstart_metrics["commit_size_added_lines"], 53121) self.assertEqual(quickstart_metrics["commit_size_removed_lines"], 51852) diff --git a/tests/unit/test_metrics.py b/tests/unit/test_metrics.py index ee4925f..8ef586e 100644 --- a/tests/unit/test_metrics.py +++ b/tests/unit/test_metrics.py @@ -122,6 +122,7 @@ def test_file_type_metrics(self): file_metrics = self.analyzer.get_file_type_metrics() self.assertEqual(file_metrics.get("code", 0), 54) + self.assertEqual(file_metrics.get("binary", 0), 1) self.assertEqual(file_metrics.get("other", 0), 24) def test_commit_size_metrics(self): diff --git a/trustable_cli/metrics.py b/trustable_cli/metrics.py index 641d431..f153e6e 100644 --- a/trustable_cli/metrics.py +++ b/trustable_cli/metrics.py @@ -40,8 +40,13 @@ COMMIT_EVENT_TYPE = "org.grimoirelab.events.git.commit" 
AUTHOR_FIELD = "Author" FILE_TYPE_CODE = ( - r"\.bazel$|\.bazelrc$|\.bzl$|\.c$|\.cc$|\.cp$|\.cpp$|\.cxx$|\.c\+\+$|" - r"\.go$|\.h$|\.js$|\.mjs$|\.java$|\.py$|\.rs$|\.sh$|\.tf$|\.ts$" + r"\.bazel$|\.bazelrc$|\.bzl$|\.c$|\.cc$|\.cp$|\.cpp$|\.cs$|\.cxx$|\.c\+\+$|" + r"\.go$|\.h$|\.hpp$|\.js$|\.mjs$|\.java$|\.pl$|\.py$|\.rs$|\.sh$|\.tf$|\.ts$" +) +FILE_TYPE_BINARY = ( + r"\.7z$|\.a$|\.abb$|\.apk$|\.app$|\.appx$|\.arc$|\.bin$|\.bz2$|\.class$|\.deb$|" + r"\.dll$|\.dmg$|\.exe$|\.gz$|\.ipa$|\.iso$|\.jar$|\.lib$|\.msi$|\.o$|\.obj$|\.rar$|" + r"\.rpm$|\.so$|\.tar$|\.xar$|\.xz$|\.zip$|\.zst$|\.Z$" ) @@ -192,6 +197,8 @@ def _update_file_metrics(self, event): # File type metrics if re.search(FILE_TYPE_CODE, file["file"]): self.file_types["code"] += 1 + elif re.search(FILE_TYPE_BINARY, file["file"]): + self.file_types["binary"] += 1 else: self.file_types["other"] += 1 From f490107c47cbacdf235fa7739424c05ab3cd055d Mon Sep 17 00:00:00 2001 From: Jose Javier Merchante Date: Wed, 12 Mar 2025 17:33:51 +0100 Subject: [PATCH 2/2] [metrics] Add support for custom file type patterns This commit allows defining custom file type patterns for code and binary files.
Signed-off-by: Jose Javier Merchante --- README.md | 4 +++- tests/end_to_end/test_cli.py | 12 ++++++------ tests/unit/test_metrics.py | 28 +++++++++++++++++++++++----- trustable_cli/cli.py | 24 ++++++++++++++++++------ trustable_cli/metrics.py | 18 ++++++++++++------ 5 files changed, 62 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 2b2f66d..d70cb3e 100644 --- a/README.md +++ b/README.md @@ -54,8 +54,10 @@ trustable spdx.xml \ --grimoirelab-user user --grimoirelab-password password \ --opensearch-url https://admin:admin@127.0.0.1:9200 \ --opensearch-index events \ - --output metrics.json \ --repository-timeout 3600 + --code-file-pattern "\.py$|\.js$" \ + --binary-file-pattern "\.exe$|\.tar$" \ + --output metrics.json ``` The parameters needed to run the tool are: diff --git a/tests/end_to_end/test_cli.py b/tests/end_to_end/test_cli.py index 2f9c7d5..45b53da 100644 --- a/tests/end_to_end/test_cli.py +++ b/tests/end_to_end/test_cli.py @@ -72,7 +72,7 @@ def test_metrics(self): self.assertEqual(quickstart_metrics["pony_factor"], 2) self.assertEqual(quickstart_metrics["elephant_factor"], 2) self.assertEqual(quickstart_metrics["file_types_other"], 683) - self.assertNotIn("file_types_binary", quickstart_metrics) + self.assertEqual(quickstart_metrics["file_types_binary"], 0) self.assertEqual(quickstart_metrics["file_types_code"], 479) self.assertEqual(quickstart_metrics["commit_size_added_lines"], 53121) self.assertEqual(quickstart_metrics["commit_size_removed_lines"], 51852) @@ -149,7 +149,7 @@ def test_from_date(self): self.assertEqual(quickstart_metrics["pony_factor"], 2) self.assertEqual(quickstart_metrics["elephant_factor"], 1) self.assertEqual(quickstart_metrics["file_types_other"], 37) - self.assertNotIn("file_types_binary", quickstart_metrics) + self.assertEqual(quickstart_metrics["file_types_binary"], 0) self.assertEqual(quickstart_metrics["file_types_code"], 17) self.assertEqual(quickstart_metrics["commit_size_added_lines"], 269) 
self.assertEqual(quickstart_metrics["commit_size_removed_lines"], 103) @@ -171,7 +171,7 @@ def test_from_date(self): self.assertEqual(angular_metrics["pony_factor"], 1) self.assertEqual(angular_metrics["elephant_factor"], 1) self.assertEqual(angular_metrics["file_types_other"], 24) - self.assertNotIn("file_types_binary", angular_metrics) + self.assertEqual(angular_metrics["file_types_binary"], 0) self.assertEqual(angular_metrics["file_types_code"], 13) self.assertEqual(angular_metrics["commit_size_added_lines"], 4849) self.assertEqual(angular_metrics["commit_size_removed_lines"], 149) @@ -227,7 +227,7 @@ def test_to_date(self): self.assertEqual(quickstart_metrics["pony_factor"], 2) self.assertEqual(quickstart_metrics["elephant_factor"], 2) self.assertEqual(quickstart_metrics["file_types_other"], 646) - self.assertNotIn("file_types_binary", quickstart_metrics) + self.assertEqual(quickstart_metrics["file_types_binary"], 0) self.assertEqual(quickstart_metrics["file_types_code"], 462) self.assertEqual(quickstart_metrics["commit_size_added_lines"], 52852) self.assertEqual(quickstart_metrics["commit_size_removed_lines"], 51749) @@ -305,7 +305,7 @@ def test_duplicate_repo(self): self.assertEqual(quickstart_metrics["pony_factor"], 2) self.assertEqual(quickstart_metrics["elephant_factor"], 2) self.assertEqual(quickstart_metrics["file_types_other"], 683) - self.assertNotIn("file_types_binary", quickstart_metrics) + self.assertEqual(quickstart_metrics["file_types_binary"], 0) self.assertEqual(quickstart_metrics["file_types_code"], 479) self.assertEqual(quickstart_metrics["commit_size_added_lines"], 53121) self.assertEqual(quickstart_metrics["commit_size_removed_lines"], 51852) @@ -361,7 +361,7 @@ def test_non_git_repo(self): self.assertEqual(quickstart_metrics["pony_factor"], 2) self.assertEqual(quickstart_metrics["elephant_factor"], 2) self.assertEqual(quickstart_metrics["file_types_other"], 683) - self.assertNotIn("file_types_binary", quickstart_metrics) + 
self.assertEqual(quickstart_metrics["file_types_binary"], 0) self.assertEqual(quickstart_metrics["file_types_code"], 479) self.assertEqual(quickstart_metrics["commit_size_added_lines"], 53121) self.assertEqual(quickstart_metrics["commit_size_removed_lines"], 51852) diff --git a/tests/unit/test_metrics.py b/tests/unit/test_metrics.py index 8ef586e..e922d79 100644 --- a/tests/unit/test_metrics.py +++ b/tests/unit/test_metrics.py @@ -116,14 +116,32 @@ def test_get_elephant_factor(self): def test_file_type_metrics(self): """Test that file type metrics are calculated correctly""" - self.assertEqual(self.analyzer.get_file_type_metrics(), {}) - self.analyzer.process_events(self.events) file_metrics = self.analyzer.get_file_type_metrics() - self.assertEqual(file_metrics.get("code", 0), 54) - self.assertEqual(file_metrics.get("binary", 0), 1) - self.assertEqual(file_metrics.get("other", 0), 24) + self.assertEqual(file_metrics["code"], 54) + self.assertEqual(file_metrics["binary"], 1) + self.assertEqual(file_metrics["other"], 24) + + def test_file_type_metrics_empty(self): + """Test that file type metrics are calculated correctly without events""" + + file_metrics = self.analyzer.get_file_type_metrics() + self.assertEqual(file_metrics["code"], 0) + self.assertEqual(file_metrics["binary"], 0) + self.assertEqual(file_metrics["other"], 0) + + def test_file_type_metrics_new_regex(self): + """Test that file type metrics are calculated correctly with new regex""" + + analyzer = GitEventsAnalyzer(code_file_pattern=r"\.py$", binary_file_pattern=r"\.md$") + + analyzer.process_events(self.events) + + file_metrics = analyzer.get_file_type_metrics() + self.assertEqual(file_metrics["code"], 53) + self.assertEqual(file_metrics["binary"], 4) + self.assertEqual(file_metrics["other"], 22) def test_commit_size_metrics(self): """Test that commit size metrics are calculated correctly""" diff --git a/trustable_cli/cli.py b/trustable_cli/cli.py index 96114a1..f31bcaa 100755 --- 
a/trustable_cli/cli.py +++ b/trustable_cli/cli.py @@ -77,6 +77,8 @@ ) @click.option("--verify-certs", is_flag=True, default=False, help="Verify SSL/TLS certificates") @click.option("--verbose", is_flag=True, default=False, help="Increase output verbosity") +@click.option("--code-file-pattern", help="Regular expression to match code file types") +@click.option("--binary-file-pattern", help="Regular expression to match binary file types") def trustable_grimoirelab_score( filename: str, grimoirelab_url: str, @@ -90,6 +92,8 @@ def trustable_grimoirelab_score( to_date: datetime.datetime | None = None, verify_certs: bool = False, verbose: bool = False, + code_file_pattern: str | None = None, + binary_file_pattern: str | None = None, ) -> None: """Calculate metrics for Trustable using GrimoireLab. @@ -131,6 +135,8 @@ def trustable_grimoirelab_score( to_date=to_date, verify_certs=verify_certs, timeout=repository_timeout, + code_file_pattern=code_file_pattern, + binary_file_pattern=binary_file_pattern, ) package_metrics = {"packages": {}} @@ -203,6 +209,8 @@ def generate_metrics_when_ready( to_date: datetime.datetime | None = None, verify_certs: bool = False, timeout: int = 3600, + code_file_pattern: str | None = None, + binary_file_pattern: str | None = None, ) -> dict[str:Any]: """Generate metrics once the repositories have finished the collection. @@ -214,6 +222,8 @@ def generate_metrics_when_ready( :param to_date: End date for metrics. :param verify_certs: Verify SSL/TLS certificates. :param timeout: Seconds to wait before failing getting metrics + :param code_file_pattern: Regular expression to match code file types. + :param binary_file_pattern: Regular expression to match binary file types. 
""" logging.info("Generating metrics") @@ -228,12 +238,14 @@ def generate_metrics_when_ready( for repository in pending_repositories: if repository_ready(grimoirelab_client, repository, after_date): metrics["repositories"][repository] = get_repository_metrics( - repository, - opensearch_url, - opensearch_index, - from_date, - to_date, - verify_certs, + repository=repository, + opensearch_url=opensearch_url, + opensearch_index=opensearch_index, + from_date=from_date, + to_date=to_date, + verify_certs=verify_certs, + code_file_pattern=code_file_pattern, + binary_file_pattern=binary_file_pattern, ) processed.add(repository) diff --git a/trustable_cli/metrics.py b/trustable_cli/metrics.py index f153e6e..7ad0b44 100644 --- a/trustable_cli/metrics.py +++ b/trustable_cli/metrics.py @@ -51,14 +51,16 @@ class GitEventsAnalyzer: - def __init__(self): + def __init__(self, code_file_pattern: str | None = None, binary_file_pattern: str | None = None): self.total_commits: int = 0 self.contributors: Counter = Counter() self.companies: Counter = Counter() - self.file_types: dict = Counter() + self.file_types: dict = {"code": 0, "binary": 0, "other": 0} self.added_lines: int = 0 self.removed_lines: int = 0 self.messages_sizes: list = [] + self.re_code_pattern = re.compile(code_file_pattern or FILE_TYPE_CODE) + self.re_binary_pattern = re.compile(binary_file_pattern or FILE_TYPE_BINARY) def process_events(self, events: iter(dict[str, Any])): for event in events: @@ -116,7 +118,7 @@ def get_elephant_factor(self): def get_file_type_metrics(self): """Get the file type metrics""" - return dict(self.file_types) + return self.file_types def get_commit_size_metrics(self): """Get the commit size metrics""" @@ -195,9 +197,9 @@ def _update_file_metrics(self, event): if not file["file"]: continue # File type metrics - if re.search(FILE_TYPE_CODE, file["file"]): + if self.re_code_pattern.search(file["file"]): self.file_types["code"] += 1 - elif re.search(FILE_TYPE_BINARY, file["file"]): + elif 
self.re_binary_pattern.search(file["file"]): self.file_types["binary"] += 1 else: self.file_types["other"] += 1 @@ -226,6 +228,8 @@ def get_repository_metrics( from_date: datetime.datetime = None, to_date: datetime.datetime = None, verify_certs: bool = True, + code_file_pattern: str | None = None, + binary_file_pattern: str | None = None, ): """ Get the metrics from a repository. @@ -236,6 +240,8 @@ def get_repository_metrics( :param verify_certs: Boolean, verify SSL/TLS certificates, default True :param from_date: Start date, by default None :param to_date: End date, by default None + :param code_file_pattern: Regular expression to match code file types. + :param binary_file_pattern: Regular expression to match binary file types. """ os_conn = connect_to_opensearch(opensearch_url, verify_certs=verify_certs) @@ -243,7 +249,7 @@ def get_repository_metrics( events = get_repository_events(os_conn, opensearch_index, repository, from_date, to_date) - analyzer = GitEventsAnalyzer() + analyzer = GitEventsAnalyzer(code_file_pattern=code_file_pattern, binary_file_pattern=binary_file_pattern) analyzer.process_events(events) metrics["metrics"]["total_commits"] = analyzer.get_commit_count()