27
27
28
28
from opensearchpy import OpenSearch , Search
29
29
30
- from grimoirelab_toolkit .datetime import str_to_datetime
30
+ from grimoirelab_toolkit .datetime import (
31
+ str_to_datetime ,
32
+ datetime_utcnow ,
33
+ datetime_to_utc ,
34
+ InvalidDateError ,
35
+ )
31
36
32
37
logging .getLogger ("opensearch" ).setLevel (logging .WARNING )
33
38
52
57
class GitEventsAnalyzer :
53
58
def __init__ (
54
59
self ,
60
+ from_date : datetime .datetime | None = None ,
61
+ to_date : datetime .datetime | None = None ,
55
62
code_file_pattern : str | None = None ,
56
63
binary_file_pattern : str | None = None ,
57
64
pony_threshold : float = 0.5 ,
58
65
elephant_threshold : float = 0.5 ,
59
66
dev_categories_thresholds : tuple [float , float ] = (0.8 , 0.95 ),
60
67
):
68
+ # Define the default dates if not provided
69
+ if from_date :
70
+ self .from_date = datetime_to_utc (from_date )
71
+ else :
72
+ self .from_date = datetime_utcnow () - datetime .timedelta (days = 365 )
73
+ if to_date :
74
+ self .to_date = datetime_to_utc (to_date )
75
+ else :
76
+ self .to_date = datetime_utcnow ()
77
+
61
78
self .total_commits : int = 0
79
+ self .recent_commits : int = 0
62
80
self .contributors : Counter = Counter ()
63
- self .companies : Counter = Counter ()
81
+ self .contributors_growth : dict [str , set ] = {"first_half" : set (), "second_half" : set ()}
82
+ self .organizations : Counter = Counter ()
83
+ self .recent_organizations : set = set ()
84
+ self .recent_contributors : set = set ()
64
85
self .file_types : dict = {"code" : 0 , "binary" : 0 , "other" : 0 }
65
86
self .added_lines : int = 0
66
87
self .removed_lines : int = 0
@@ -74,6 +95,8 @@ def __init__(
74
95
self .last_commit : str | None = None
75
96
self .first_commit_date : datetime .datetime | None = None
76
97
self .last_commit_date : datetime .datetime | None = None
98
+ self .active_branches : set = set ()
99
+ self ._half_period = self .from_date + (self .to_date - self .from_date ) / 2
77
100
78
101
def process_events (self , events : iter (dict [str , Any ])):
79
102
for event in events :
@@ -82,9 +105,10 @@ def process_events(self, events: iter(dict[str, Any])):
82
105
83
106
event_data = event .get ("data" )
84
107
85
- self .total_commits += 1
86
- self .contributors [event_data [AUTHOR_FIELD ]] += 1
87
- self ._update_companies (event_data )
108
+ self ._update_commit_count (event_data )
109
+ self ._update_branches (event_data )
110
+ self ._update_contributors (event_data )
111
+ self ._update_organizations (event_data )
88
112
self ._update_file_metrics (event_data )
89
113
self ._update_message_size_metrics (event_data )
90
114
self ._update_first_and_last_commit (event_data )
@@ -95,6 +119,9 @@ def get_commit_count(self):
95
119
def get_contributor_count(self):
    """Return how many distinct authors have been counted so far."""

    distinct_authors = self.contributors
    return len(distinct_authors)
97
121
122
def get_organization_count(self):
    """Return how many distinct organizations have been counted so far."""

    distinct_organizations = self.organizations
    return len(distinct_organizations)
124
+
98
125
def get_pony_factor (self ):
99
126
"""Number of individuals producing up to 50% of the total number of code contributions"""
100
127
@@ -113,15 +140,15 @@ def get_pony_factor(self):
113
140
return pony_factor
114
141
115
142
def get_elephant_factor (self ):
116
- """Number of companies producing up to 50% of the total number of code contributions"""
143
+ """Number of organizations producing up to 50% of the total number of code contributions"""
117
144
118
145
partial_contributions = 0
119
146
elephant_factor = 0
120
147
121
- if len (self .companies ) == 0 :
148
+ if len (self .organizations ) == 0 :
122
149
return 0
123
150
124
- for _ , contributions in self .companies .most_common ():
151
+ for _ , contributions in self .organizations .most_common ():
125
152
partial_contributions += contributions
126
153
elephant_factor += 1
127
154
if partial_contributions / self .total_commits > self .elephant_threshold :
@@ -209,6 +236,48 @@ def get_developer_categories(self):
209
236
"casual" : casual ,
210
237
}
211
238
239
def get_recent_organizations(self):
    """Return the number of organizations with a commit at most 90 days before `to_date`."""

    recent = self.recent_organizations
    return len(recent)
243
+
244
def get_recent_contributors(self):
    """Return the number of authors with a commit at most 90 days before `to_date`."""

    recent = self.recent_contributors
    return len(recent)
248
+
249
def get_recent_commits(self) -> int:
    """Return how many processed commits fall at most 90 days before `to_date`."""

    recent_total = self.recent_commits
    return recent_total
253
+
254
def get_growth_of_contributors(self):
    """Return the change in distinct contributors between the two halves of the period.

    The result is the size of the "second_half" author set minus the size of
    the "first_half" one, so it is negative when fewer authors were active in
    the later half.
    """

    earlier_authors, later_authors = (
        self.contributors_growth[half] for half in ("first_half", "second_half")
    )

    return len(later_authors) - len(earlier_authors)
261
+
262
def get_growth_rate_of_contributors(self):
    """Return the relative change in distinct contributors between period halves.

    With a non-empty first half the result is
    `(second_half - first_half) / first_half`. When the first half has no
    contributors a ratio is undefined, so the second-half count itself is
    returned (which is 0 when both halves are empty).
    """

    baseline = len(self.contributors_growth["first_half"])
    latest = len(self.contributors_growth["second_half"])

    if baseline:
        return (latest - baseline) / baseline

    # No baseline to divide by: report the raw second-half count instead
    return latest
275
+
276
def get_active_branch_count(self):
    """Return how many distinct branch names were collected from commit refs."""

    branches = self.active_branches
    return len(branches)
280
+
212
281
def get_analysis_metadata (self ):
213
282
"""Return metadata about the analysis."""
214
283
@@ -226,13 +295,78 @@ def get_analysis_metadata(self):
226
295
227
296
return metadata
228
297
229
- def _update_companies (self , event ):
298
def get_days_since_last_commit(self):
    """Return the whole days between the last commit and `to_date`.

    Returns None when no last commit date has been recorded.
    """

    if self.last_commit_date:
        elapsed = self.to_date - self.last_commit_date
        return elapsed.days

    return None
307
+
308
def _update_commit_count(self, event_data):
    """Count the commit and, when it is recent enough, count it as recent too.

    Every call increments `total_commits`. `recent_commits` is incremented
    only when "CommitDate" can be parsed and lies at most 90 days before
    `to_date`; a missing or unparsable date leaves it untouched.
    """

    self.total_commits += 1

    try:
        committed_at = str_to_datetime(event_data.get("CommitDate"))
        is_recent = (self.to_date - committed_at).days <= 90
    except (ValueError, TypeError, InvalidDateError):
        # Without a usable date the commit only counts towards the total
        return

    if is_recent:
        self.recent_commits += 1
323
+
324
def _update_contributors(self, event_data):
    """Update the per-author metrics with one commit.

    The author's commit counter is always incremented. When "CommitDate" can
    be parsed, the author is also added to the "first_half" or "second_half"
    growth set (depending on which side of the period midpoint the commit
    falls) and, if the commit is at most 90 days before `to_date`, to the
    set of recent contributors.

    :param event_data: commit payload; must contain the AUTHOR_FIELD key and
        may contain a "CommitDate" string

    :raises KeyError: when the author field is missing from `event_data`
    """
    author = event_data[AUTHOR_FIELD]

    self.contributors[author] += 1

    # Parse the commit date once; both period-based metrics below rely on it
    try:
        commit_date = str_to_datetime(event_data.get("CommitDate"))
    except (ValueError, TypeError, InvalidDateError):
        # No usable date: only the overall counter can be updated
        return

    # Update contributor growth
    if self._half_period:
        half = "first_half" if commit_date < self._half_period else "second_half"
        self.contributors_growth[half].add(author)

    # Update contributors by period
    try:
        days_interval = (self.to_date - commit_date).days
    except TypeError:
        return

    if days_interval <= 90:
        self.recent_contributors.add(author)
351
+
352
def _update_organizations(self, event_data):
    """Update the per-organization metrics with one commit.

    The organization is the text after the first "@" of the author string
    with its last character dropped (presumably the closing ">" of an
    "Name <user@domain>" author - confirm against the data source). Commits
    whose author is missing or has no "@" are ignored.
    """

    try:
        domain_part = event_data[AUTHOR_FIELD].split("@")[1]
    except (IndexError, KeyError):
        return

    org_name = domain_part[:-1]
    self.organizations[org_name] += 1

    # An organization is "recent" when the commit is at most 90 days before to_date
    try:
        committed_at = str_to_datetime(event_data.get("CommitDate"))
        is_recent = (self.to_date - committed_at).days <= 90
    except (ValueError, TypeError, InvalidDateError):
        return

    if is_recent:
        self.recent_organizations.add(org_name)
236
370
237
371
def _update_file_metrics (self , event ):
238
372
if "files" not in event :
@@ -241,6 +375,7 @@ def _update_file_metrics(self, event):
241
375
for file in event ["files" ]:
242
376
if not file ["file" ]:
243
377
continue
378
+
244
379
# File type metrics
245
380
if self .re_code_pattern .search (file ["file" ]):
246
381
self .file_types ["code" ] += 1
@@ -283,6 +418,19 @@ def _update_first_and_last_commit(self, event):
283
418
self .last_commit = commit
284
419
self .last_commit_date = commit_date
285
420
421
+ def _update_branches (self , event_data ):
422
+ """Identify the refs that are branches and update the active branches."""
423
+
424
+ if "refs" not in event_data :
425
+ return
426
+
427
+ for ref in event_data ["refs" ]:
428
+ if "refs/heads/" not in ref :
429
+ continue
430
+
431
+ branch_name = ref .split ("refs/heads/" )[1 ]
432
+ self .active_branches .add (branch_name )
433
+
286
434
287
435
def get_repository_metrics (
288
436
repository : str ,
@@ -331,6 +479,8 @@ def get_repository_metrics(
331
479
events = get_repository_events (os_conn , opensearch_index , repository , from_date , to_date )
332
480
333
481
analyzer = GitEventsAnalyzer (
482
+ from_date = from_date ,
483
+ to_date = to_date ,
334
484
code_file_pattern = code_file_pattern ,
335
485
binary_file_pattern = binary_file_pattern ,
336
486
pony_threshold = pony_threshold ,
@@ -341,8 +491,16 @@ def get_repository_metrics(
341
491
342
492
metrics ["metrics" ]["total_commits" ] = analyzer .get_commit_count ()
343
493
metrics ["metrics" ]["total_contributors" ] = analyzer .get_contributor_count ()
494
+ metrics ["metrics" ]["total_organizations" ] = analyzer .get_organization_count ()
344
495
metrics ["metrics" ]["pony_factor" ] = analyzer .get_pony_factor ()
345
496
metrics ["metrics" ]["elephant_factor" ] = analyzer .get_elephant_factor ()
497
+ metrics ["metrics" ]["recent_organizations" ] = analyzer .get_recent_organizations ()
498
+ metrics ["metrics" ]["recent_contributors" ] = analyzer .get_recent_contributors ()
499
+ metrics ["metrics" ]["recent_commits" ] = analyzer .get_recent_commits ()
500
+ metrics ["metrics" ]["contributor_growth" ] = analyzer .get_growth_of_contributors ()
501
+ metrics ["metrics" ]["contributor_growth_rate" ] = analyzer .get_growth_rate_of_contributors ()
502
+ metrics ["metrics" ]["active_branches" ] = analyzer .get_active_branch_count ()
503
+ metrics ["metrics" ]["days_since_last_commit" ] = analyzer .get_days_since_last_commit ()
346
504
347
505
if from_date and to_date :
348
506
days = (to_date - from_date ).days
0 commit comments