Skip to content

Commit f490107

Browse files
committed
[metrics] Add support for custom file type patterns
This commit allows defining custom file type patterns for code and binary files. Signed-off-by: Jose Javier Merchante <[email protected]>
1 parent ef035f3 commit f490107

File tree

5 files changed

+62
-24
lines changed

5 files changed

+62
-24
lines changed

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,8 +54,10 @@ trustable spdx.xml \
5454
--grimoirelab-user user --grimoirelab-password password \
5555
--opensearch-url https://admin:[email protected]:9200 \
5656
--opensearch-index events \
57-
--output metrics.json \
5857
--repository-timeout 3600 \
58+
--code-file-pattern "\.py$|\.js$" \
59+
--binary-file-pattern "\.exe$|\.tar$" \
60+
--output metrics.json
5961
```
6062

6163
The parameters needed to run the tool are:

tests/end_to_end/test_cli.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ def test_metrics(self):
7272
self.assertEqual(quickstart_metrics["pony_factor"], 2)
7373
self.assertEqual(quickstart_metrics["elephant_factor"], 2)
7474
self.assertEqual(quickstart_metrics["file_types_other"], 683)
75-
self.assertNotIn("file_types_binary", quickstart_metrics)
75+
self.assertEqual(quickstart_metrics["file_types_binary"], 0)
7676
self.assertEqual(quickstart_metrics["file_types_code"], 479)
7777
self.assertEqual(quickstart_metrics["commit_size_added_lines"], 53121)
7878
self.assertEqual(quickstart_metrics["commit_size_removed_lines"], 51852)
@@ -149,7 +149,7 @@ def test_from_date(self):
149149
self.assertEqual(quickstart_metrics["pony_factor"], 2)
150150
self.assertEqual(quickstart_metrics["elephant_factor"], 1)
151151
self.assertEqual(quickstart_metrics["file_types_other"], 37)
152-
self.assertNotIn("file_types_binary", quickstart_metrics)
152+
self.assertEqual(quickstart_metrics["file_types_binary"], 0)
153153
self.assertEqual(quickstart_metrics["file_types_code"], 17)
154154
self.assertEqual(quickstart_metrics["commit_size_added_lines"], 269)
155155
self.assertEqual(quickstart_metrics["commit_size_removed_lines"], 103)
@@ -171,7 +171,7 @@ def test_from_date(self):
171171
self.assertEqual(angular_metrics["pony_factor"], 1)
172172
self.assertEqual(angular_metrics["elephant_factor"], 1)
173173
self.assertEqual(angular_metrics["file_types_other"], 24)
174-
self.assertNotIn("file_types_binary", angular_metrics)
174+
self.assertEqual(angular_metrics["file_types_binary"], 0)
175175
self.assertEqual(angular_metrics["file_types_code"], 13)
176176
self.assertEqual(angular_metrics["commit_size_added_lines"], 4849)
177177
self.assertEqual(angular_metrics["commit_size_removed_lines"], 149)
@@ -227,7 +227,7 @@ def test_to_date(self):
227227
self.assertEqual(quickstart_metrics["pony_factor"], 2)
228228
self.assertEqual(quickstart_metrics["elephant_factor"], 2)
229229
self.assertEqual(quickstart_metrics["file_types_other"], 646)
230-
self.assertNotIn("file_types_binary", quickstart_metrics)
230+
self.assertEqual(quickstart_metrics["file_types_binary"], 0)
231231
self.assertEqual(quickstart_metrics["file_types_code"], 462)
232232
self.assertEqual(quickstart_metrics["commit_size_added_lines"], 52852)
233233
self.assertEqual(quickstart_metrics["commit_size_removed_lines"], 51749)
@@ -305,7 +305,7 @@ def test_duplicate_repo(self):
305305
self.assertEqual(quickstart_metrics["pony_factor"], 2)
306306
self.assertEqual(quickstart_metrics["elephant_factor"], 2)
307307
self.assertEqual(quickstart_metrics["file_types_other"], 683)
308-
self.assertNotIn("file_types_binary", quickstart_metrics)
308+
self.assertEqual(quickstart_metrics["file_types_binary"], 0)
309309
self.assertEqual(quickstart_metrics["file_types_code"], 479)
310310
self.assertEqual(quickstart_metrics["commit_size_added_lines"], 53121)
311311
self.assertEqual(quickstart_metrics["commit_size_removed_lines"], 51852)
@@ -361,7 +361,7 @@ def test_non_git_repo(self):
361361
self.assertEqual(quickstart_metrics["pony_factor"], 2)
362362
self.assertEqual(quickstart_metrics["elephant_factor"], 2)
363363
self.assertEqual(quickstart_metrics["file_types_other"], 683)
364-
self.assertNotIn("file_types_binary", quickstart_metrics)
364+
self.assertEqual(quickstart_metrics["file_types_binary"], 0)
365365
self.assertEqual(quickstart_metrics["file_types_code"], 479)
366366
self.assertEqual(quickstart_metrics["commit_size_added_lines"], 53121)
367367
self.assertEqual(quickstart_metrics["commit_size_removed_lines"], 51852)

tests/unit/test_metrics.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -116,14 +116,32 @@ def test_get_elephant_factor(self):
116116
def test_file_type_metrics(self):
117117
"""Test that file type metrics are calculated correctly"""
118118

119-
self.assertEqual(self.analyzer.get_file_type_metrics(), {})
120-
121119
self.analyzer.process_events(self.events)
122120

123121
file_metrics = self.analyzer.get_file_type_metrics()
124-
self.assertEqual(file_metrics.get("code", 0), 54)
125-
self.assertEqual(file_metrics.get("binary", 0), 1)
126-
self.assertEqual(file_metrics.get("other", 0), 24)
122+
self.assertEqual(file_metrics["code"], 54)
123+
self.assertEqual(file_metrics["binary"], 1)
124+
self.assertEqual(file_metrics["other"], 24)
125+
126+
def test_file_type_metrics_empty(self):
127+
"""Test that file type metrics are calculated correctly without events"""
128+
129+
file_metrics = self.analyzer.get_file_type_metrics()
130+
self.assertEqual(file_metrics["code"], 0)
131+
self.assertEqual(file_metrics["binary"], 0)
132+
self.assertEqual(file_metrics["other"], 0)
133+
134+
def test_file_type_metrics_new_regex(self):
135+
"""Test that file type metrics are calculated correctly with new regex"""
136+
137+
analyzer = GitEventsAnalyzer(code_file_pattern=r"\.py$", binary_file_pattern=r"\.md$")
138+
139+
analyzer.process_events(self.events)
140+
141+
file_metrics = analyzer.get_file_type_metrics()
142+
self.assertEqual(file_metrics["code"], 53)
143+
self.assertEqual(file_metrics["binary"], 4)
144+
self.assertEqual(file_metrics["other"], 22)
127145

128146
def test_commit_size_metrics(self):
129147
"""Test that commit size metrics are calculated correctly"""

trustable_cli/cli.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@
7777
)
7878
@click.option("--verify-certs", is_flag=True, default=False, help="Verify SSL/TLS certificates")
7979
@click.option("--verbose", is_flag=True, default=False, help="Increase output verbosity")
80+
@click.option("--code-file-pattern", help="Regular expression to match code file types")
81+
@click.option("--binary-file-pattern", help="Regular expression to match binary file types")
8082
def trustable_grimoirelab_score(
8183
filename: str,
8284
grimoirelab_url: str,
@@ -90,6 +92,8 @@ def trustable_grimoirelab_score(
9092
to_date: datetime.datetime | None = None,
9193
verify_certs: bool = False,
9294
verbose: bool = False,
95+
code_file_pattern: str | None = None,
96+
binary_file_pattern: str | None = None,
9397
) -> None:
9498
"""Calculate metrics for Trustable using GrimoireLab.
9599
@@ -131,6 +135,8 @@ def trustable_grimoirelab_score(
131135
to_date=to_date,
132136
verify_certs=verify_certs,
133137
timeout=repository_timeout,
138+
code_file_pattern=code_file_pattern,
139+
binary_file_pattern=binary_file_pattern,
134140
)
135141

136142
package_metrics = {"packages": {}}
@@ -203,6 +209,8 @@ def generate_metrics_when_ready(
203209
to_date: datetime.datetime | None = None,
204210
verify_certs: bool = False,
205211
timeout: int = 3600,
212+
code_file_pattern: str | None = None,
213+
binary_file_pattern: str | None = None,
206214
) -> dict[str:Any]:
207215
"""Generate metrics once the repositories have finished the collection.
208216
@@ -214,6 +222,8 @@ def generate_metrics_when_ready(
214222
:param to_date: End date for metrics.
215223
:param verify_certs: Verify SSL/TLS certificates.
216224
:param timeout: Seconds to wait before failing getting metrics
225+
:param code_file_pattern: Regular expression to match code file types.
226+
:param binary_file_pattern: Regular expression to match binary file types.
217227
"""
218228
logging.info("Generating metrics")
219229

@@ -228,12 +238,14 @@ def generate_metrics_when_ready(
228238
for repository in pending_repositories:
229239
if repository_ready(grimoirelab_client, repository, after_date):
230240
metrics["repositories"][repository] = get_repository_metrics(
231-
repository,
232-
opensearch_url,
233-
opensearch_index,
234-
from_date,
235-
to_date,
236-
verify_certs,
241+
repository=repository,
242+
opensearch_url=opensearch_url,
243+
opensearch_index=opensearch_index,
244+
from_date=from_date,
245+
to_date=to_date,
246+
verify_certs=verify_certs,
247+
code_file_pattern=code_file_pattern,
248+
binary_file_pattern=binary_file_pattern,
237249
)
238250
processed.add(repository)
239251

trustable_cli/metrics.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -51,14 +51,16 @@
5151

5252

5353
class GitEventsAnalyzer:
54-
def __init__(self):
54+
def __init__(self, code_file_pattern: str | None = None, binary_file_pattern: str | None = None):
5555
self.total_commits: int = 0
5656
self.contributors: Counter = Counter()
5757
self.companies: Counter = Counter()
58-
self.file_types: dict = Counter()
58+
self.file_types: dict = {"code": 0, "binary": 0, "other": 0}
5959
self.added_lines: int = 0
6060
self.removed_lines: int = 0
6161
self.messages_sizes: list = []
62+
self.re_code_pattern = re.compile(code_file_pattern or FILE_TYPE_CODE)
63+
self.re_binary_pattern = re.compile(binary_file_pattern or FILE_TYPE_BINARY)
6264

6365
def process_events(self, events: iter(dict[str, Any])):
6466
for event in events:
@@ -116,7 +118,7 @@ def get_elephant_factor(self):
116118
def get_file_type_metrics(self):
117119
"""Get the file type metrics"""
118120

119-
return dict(self.file_types)
121+
return self.file_types
120122

121123
def get_commit_size_metrics(self):
122124
"""Get the commit size metrics"""
@@ -195,9 +197,9 @@ def _update_file_metrics(self, event):
195197
if not file["file"]:
196198
continue
197199
# File type metrics
198-
if re.search(FILE_TYPE_CODE, file["file"]):
200+
if self.re_code_pattern.search(file["file"]):
199201
self.file_types["code"] += 1
200-
elif re.search(FILE_TYPE_BINARY, file["file"]):
202+
elif self.re_binary_pattern.search(file["file"]):
201203
self.file_types["binary"] += 1
202204
else:
203205
self.file_types["other"] += 1
@@ -226,6 +228,8 @@ def get_repository_metrics(
226228
from_date: datetime.datetime = None,
227229
to_date: datetime.datetime = None,
228230
verify_certs: bool = True,
231+
code_file_pattern: str | None = None,
232+
binary_file_pattern: str | None = None,
229233
):
230234
"""
231235
Get the metrics from a repository.
@@ -236,14 +240,16 @@ def get_repository_metrics(
236240
:param verify_certs: Boolean, verify SSL/TLS certificates, default True
237241
:param from_date: Start date, by default None
238242
:param to_date: End date, by default None
243+
:param code_file_pattern: Regular expression to match code file types.
244+
:param binary_file_pattern: Regular expression to match binary file types.
239245
"""
240246
os_conn = connect_to_opensearch(opensearch_url, verify_certs=verify_certs)
241247

242248
metrics = {"metrics": {}}
243249

244250
events = get_repository_events(os_conn, opensearch_index, repository, from_date, to_date)
245251

246-
analyzer = GitEventsAnalyzer()
252+
analyzer = GitEventsAnalyzer(code_file_pattern=code_file_pattern, binary_file_pattern=binary_file_pattern)
247253
analyzer.process_events(events)
248254

249255
metrics["metrics"]["total_commits"] = analyzer.get_commit_count()

0 commit comments

Comments
 (0)