1
1
# signatures.py
2
+ import math
2
3
import re
3
4
from pathlib import Path
4
5
from threading import Lock
@@ -15,6 +16,7 @@ class Match(NamedTuple):
15
16
16
17
service : str
17
18
type : str
19
+ key : str
18
20
value : str
19
21
line_number : int
20
22
start_index : int
@@ -42,6 +44,16 @@ class CodegateSignatures:
42
44
_signature_groups : ClassVar [List [SignatureGroup ]] = []
43
45
_compiled_regexes : ClassVar [Dict [str , re .Pattern ]] = {}
44
46
_yaml_path : ClassVar [Optional [str ]] = None
47
+ HIGH_ENTROPY_THRESHOLD : ClassVar [float ] = 4.0
48
+
49
+ @classmethod
50
+ def _calculate_entropy (cls , text : str ) -> float :
51
+ """Calculate Shannon entropy for a given string."""
52
+ if not text :
53
+ return 0.0
54
+
55
+ prob = {char : text .count (char ) / len (text ) for char in set (text )}
56
+ return - sum (p * math .log2 (p ) for p in prob .values ())
45
57
46
58
@classmethod
47
59
def reset (cls ) -> None :
@@ -180,22 +192,11 @@ def _load_signatures(cls) -> None:
180
192
# Clear existing signatures before loading new ones
181
193
cls ._signature_groups = []
182
194
cls ._compiled_regexes = {}
183
-
184
195
yaml_data = cls ._load_yaml (cls ._yaml_path )
185
196
186
- # Add custom GitHub token patterns
187
- github_patterns = {
188
- "Access Token" : r"ghp_[0-9a-zA-Z]{32}" ,
189
- "Personal Token" : r"github_pat_[a-zA-Z0-9]{22}_[a-zA-Z0-9]{59}" ,
190
- }
191
- cls ._add_signature_group ("GitHub" , github_patterns )
192
-
193
197
# Process patterns from YAML
194
198
for item in yaml_data :
195
199
for service_name , patterns in item .items ():
196
- if service_name == "GitHub" :
197
- continue
198
-
199
200
service_patterns = {}
200
201
for pattern_dict in patterns :
201
202
for pattern_name , pattern in pattern_dict .items ():
@@ -224,6 +225,7 @@ def find_in_string(cls, text: Union[str, List[str]]) -> List[Match]:
224
225
raise RuntimeError ("SecretFinder not initialized." )
225
226
226
227
matches = []
228
+ found_values = set ()
227
229
228
230
# Split text into lines for processing
229
231
try :
@@ -233,32 +235,74 @@ def find_in_string(cls, text: Union[str, List[str]]) -> List[Match]:
233
235
return []
234
236
235
237
for line_num , line in enumerate (lines , start = 1 ):
236
- for group in cls ._signature_groups :
237
- for pattern_name in group .patterns :
238
- regex_key = f"{ group .name } :{ pattern_name } "
239
- regex = cls ._compiled_regexes .get (regex_key )
238
+ matches .extend (cls ._find_regex_matches (line , line_num , found_values ))
239
+ matches .extend (cls ._find_high_entropy_matches (line , line_num , found_values ))
240
+ return matches
240
241
241
- if not regex :
242
@classmethod
def _find_regex_matches(cls, line: str, line_num: int, found_values: set) -> List[Match]:
    """Find secrets in a single line using the compiled signature regexes.

    Args:
        line: The line of text to scan.
        line_num: Line number recorded on every returned ``Match``.
        found_values: Set of ``"<key>:<value>"`` strings already reported.
            It is read to suppress duplicates and is updated in place with
            every new hit.

    Returns:
        One ``Match`` per new hit, in the order the signature groups and
        their patterns are iterated.
    """
    matches = []
    for group in cls._signature_groups:
        # Only the pattern names are needed here: the compiled regex is
        # looked up in ``_compiled_regexes`` under "<group>:<pattern>".
        for pattern_name in group.patterns:
            regex = cls._compiled_regexes.get(f"{group.name}:{pattern_name}")
            if not regex:
                continue
            for match in regex.finditer(line):
                value = match.group()
                # ``key`` is None when the value is not part of a
                # ``name = value`` assignment on this line; the dedup
                # string then reads "None:<value>".
                key = cls._extract_key_from_line(line, value)
                pattern = f"{key}:{value}"
                # Skip the bare word "token" and anything already reported.
                if value.lower() == "token" or pattern in found_values:
                    continue
                found_values.add(pattern)
                matches.append(
                    Match(
                        group.name,
                        pattern_name,
                        key,
                        value,
                        line_num,
                        match.start(),
                        match.end(),
                    )
                )
    return matches
243
271
244
- try :
245
- for match in regex .finditer (line ):
246
- value = match .group ()
247
- if value .lower () == "token" :
248
- continue
272
+ @staticmethod
273
+ def _extract_key_from_line (line : str , secret_value : str ) -> Optional [str ]:
274
+ """
275
+ Extract the key associated with a secret value if it follows a key=value pattern.
276
+ """
277
+ match = re .search (
278
+ r'([A-Za-z_][A-Za-z0-9_]*)\s*=\s*["\']?' + re .escape (secret_value ) + r'["\']?' , line
279
+ )
280
+ return match .group (1 ) if match else None
249
281
250
- matches .append (
251
- Match (
252
- service = group .name ,
253
- type = pattern_name ,
254
- value = value ,
255
- line_number = line_num ,
256
- start_index = match .start (),
257
- end_index = match .end (),
258
- )
259
- )
260
- except Exception as e :
261
- logger .warning (f"Error matching pattern { regex_key } : { e } " )
262
- continue
282
@classmethod
def _find_high_entropy_matches(
    cls, line: str, line_num: int, found_values: set
) -> List[Match]:
    """Find ``name = value`` assignments whose value has high entropy.

    Args:
        line: The line of text to scan.
        line_num: Line number recorded on every returned ``Match``.
        found_values: Set of ``"<key>:<value>"`` strings already reported.
            It is read to suppress duplicates and is updated in place with
            every new hit.

    Returns:
        One ``Match`` (service ``"High Entropy"``, type
        ``"Potential Secret"``) for each assigned value of at least eight
        characters whose entropy reaches ``HIGH_ENTROPY_THRESHOLD``.
        Values starting with ``"REDACTED"`` are skipped.
    """
    matches = []
    assignments = re.finditer(
        r"([A-Za-z_][A-Za-z0-9_]*)\s*=\s*([\"']?([A-Za-z0-9_\-\.+/=]{8,})[\"']?)", line
    )

    for assignment in assignments:
        key = assignment.group(1)
        word = assignment.group(3)  # the value without surrounding quotes
        pattern = f"{key}:{word}"
        if pattern in found_values or word.startswith("REDACTED"):
            continue
        if cls._calculate_entropy(word) < cls.HIGH_ENTROPY_THRESHOLD:
            continue
        found_values.add(pattern)
        matches.append(
            Match(
                "High Entropy",
                "Potential Secret",
                key,
                word,
                line_num,
                # Use the offsets of the matched value itself.  Searching the
                # line with ``line.find(word)`` returns the first occurrence
                # of the text, which is the wrong span when the same text
                # appears earlier in the line (e.g. in a comment or as the
                # key).
                assignment.start(3),
                assignment.end(3),
            )
        )

    return matches
0 commit comments