Skip to content

Commit 5dc367d

Browse files
committed
feat: add better functionality to secret detection
Add the following functionality: improve the regexes for identifying secrets via patterns; add high-entropy secret detection; include the key of the secret in the found matches. Closes: #209
1 parent 133546b commit 5dc367d

File tree

5 files changed

+100
-46
lines changed

5 files changed

+100
-46
lines changed

prompts/default.yaml

+6-4
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,13 @@ default_chat: |
1313
- Pkg 2: [https://www.insight.stacklok.com/report/ecosystem/package_name](https://www.insight.stacklok.com/report/ecosystem/package_name?utm_source=codegate)
1414
- ...
1515
16-
If you are Kodu, also append that when finding malicious packages or leaked secrets:
16+
Also append that when finding malicious packages or leaked secrets:
1717
18-
"** critical vulnerability found, you must take action **
19-
Please notify the user with ask follow up tool about the following vulnerability:
20-
<vulnerability_content>"
18+
"** critical vulnerability found, you must take action **"
19+
20+
If you are Kodu, also append:
21+
22+
"Please notify the user with ask follow up tool about the following vulnerability: <vulnerability_content>"
2123
2224
If package_name contains any special chars, please encode them using URL encoding.
2325

signatures.yaml

+10-8
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@
3232
- API Key: \bSK[0-9a-fA-F]{32}\b
3333
- Access Token: \b(?i)55[0-9a-fA-F]{32}\b
3434

35+
- Twitter:
36+
- Client ID: (?i)twitter(.{0,20})?['\"][0-9a-z]{18,25}
37+
- Secret: (?i)twitter(.{0,20})?['\"][0-9a-z]{35,44}
38+
3539
- Atlassian:
3640
- JIRA Token: (?i)(jira_token)
3741
- Bitbucket Data Center Access Token: BBDC-[a-zA-Z0-9+]{44}
@@ -40,6 +44,7 @@
4044

4145
- Google:
4246
- Cloud API Key: AIza[0-9A-Za-z_-]{35}
47+
- Cloud Platform API Key: (?i)(google|gcp|youtube|drive|yt)(.{0,20})?['\"][AIza[0-9a-z\\-_]{35}]['\"]
4348
- Cloud OAuth Secret: (?i)(GOCSPX-[-0-9A-Za-z_]{24,32})
4449
#- reCaptcha Key: 6L([A-Za-z0-9_-]{6})AAAAA([A-Za-z0-9_-]{27})
4550
- OAuth Key: ya29\.[0-9A-Za-z_-]{64,256}
@@ -98,6 +103,8 @@
98103
- Meta:
99104
- Page Access Token: (?i)(EAAG[0-9A-Za-z]{10,128})
100105
- Facebook Access Token: EAACEdEose0cBA[0-9A-Za-z]+
106+
- Facebook Client ID: (?i)(facebook|fb)(.{0,20})?['\"][0-9]{13,17}
107+
- Facebook Secret Key: (?i)(facebook|fb)(.{0,20})?(?-i)['\"][0-9a-f]{32}
101108
#- Client Token: (?i)fb[a-zA-Z0-9]{24,32}
102109
- Instagram Access Token: (?i)(IGQV[0-9A-Za-z-_]{10,255})
103110
- Instagram App Secret: (?i)(ig_[a-f0-9]{32})
@@ -192,6 +199,7 @@
192199

193200
- Artifactory:
194201
- Token: AKCp[0-9][a-zA-Z0-9]{64,128}
202+
- Password: AP[\dABCDEF][a-zA-Z0-9]{8,}
195203

196204
- Figma:
197205
- Personal Access Token: (figd_[a-zA-Z0-9-_]{14,32}_[a-zA-Z0-9-_]{14,32})
@@ -265,13 +273,6 @@
265273
- Postgresql:
266274
- URL: (?i)(?:pgsql:|postgres:|postgresql:)//[\S]{1,256}:[\S]{1,256}@[-.%\w\/:]+\.[\S]+
267275

268-
- GitHub:
269-
- Access Token: (?i)\bghp_[A-Za-z0-9]{36}\b
270-
- OAuth Token: (?i)\bgho_[A-Za-z0-9]{36}\b
271-
- App Installation Token: (?i)\bghu_[A-Za-z0-9]{36}\b
272-
- App user Token: (?i)\bghs_[A-Za-z0-9]{36}\b
273-
- Refresh Token: (?i)\bghr_[A-Za-z0-9]{36}\b
274-
275276
- Addresses:
276277
- Bitcoin Legacy: \b[13][a-km-zA-HJ-NP-Z1-9]{25,34}\b
277278
- Bitcoin SegWit: \b(bc1)[a-zA-HJ-NP-Z0-9]{39,59}\b
@@ -299,7 +300,8 @@
299300
- Advanced Message Queuing Protocol (AMQP) URL: amqp://[a-zA-Z0-9-_+.@]+:[^@]+@[^/]+
300301
# Private Keys
301302
- JSON Web Key Block: /^(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?$/gm
302-
- Private Key Block: -{0,5} ?BEGIN (?:RSA |ENCRYPTED |OPENSSH |SSH2 )?PRIVATE KEY ?-{0,5} ?([\s\S]*?)-{0,5} ?END (?:RSA |ENCRYPTED |OPENSSH |SSH2 )?PRIVATE KEY ?-{0,5}
303+
- Private Key Block: -{0,5} ?BEGIN (?:RSA |ENCRYPTED |OPENSSH |SSH2 |DSA |EC )?PRIVATE KEY ?-{0,5} ?([\s\S]*?)-{0,5} ?END (?:RSA |ENCRYPTED |OPENSSH |SSH2 |DSA |EC )?PRIVATE KEY ?-{0,5}
304+
- PGP: -{0,5}BEGIN PGP PRIVATE KEY BLOCK-{0,5}[\s\S]*?-{0,5}END PGP PRIVATE KEY BLOCK-{0,5}
303305
- Bitcoin Private Key: \b[5KL][1-9A-HJ-NP-Za-km-z]{50,51}\b
304306
- Ethereum Private Key: \b0x[a-fA-F0-9]{64}\b
305307
- Litecoin Private Key: \b[5KL][1-9A-HJ-NP-Za-km-z]{50,51}\b

src/codegate/pipeline/secrets/secrets.py

+2
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ def obfuscate(self, text: str) -> tuple[str, List[Match]]:
147147
logger.info(
148148
f"\nService: {match.service}"
149149
f"\nType: {match.type}"
150+
f"\nKey: {match.key}"
150151
f"\nOriginal: {match.value}"
151152
f"\nEncrypted: {hidden_secret}"
152153
)
@@ -450,6 +451,7 @@ async def process_chunk(
450451
or input_context.metadata.get("redacted_secrets_count", 0) == 0
451452
):
452453
return [chunk]
454+
453455
tool_name = next(
454456
(
455457
tool.lower()

src/codegate/pipeline/secrets/signatures.py

+78-34
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# signatures.py
2+
import math
23
import re
34
from pathlib import Path
45
from threading import Lock
@@ -15,6 +16,7 @@ class Match(NamedTuple):
1516

1617
service: str
1718
type: str
19+
key: str
1820
value: str
1921
line_number: int
2022
start_index: int
@@ -42,6 +44,16 @@ class CodegateSignatures:
4244
_signature_groups: ClassVar[List[SignatureGroup]] = []
4345
_compiled_regexes: ClassVar[Dict[str, re.Pattern]] = {}
4446
_yaml_path: ClassVar[Optional[str]] = None
47+
HIGH_ENTROPY_THRESHOLD: ClassVar[float] = 4.0
48+
49+
@classmethod
50+
def _calculate_entropy(cls, text: str) -> float:
51+
"""Calculate Shannon entropy for a given string."""
52+
if not text:
53+
return 0.0
54+
55+
prob = {char: text.count(char) / len(text) for char in set(text)}
56+
return -sum(p * math.log2(p) for p in prob.values())
4557

4658
@classmethod
4759
def reset(cls) -> None:
@@ -180,22 +192,11 @@ def _load_signatures(cls) -> None:
180192
# Clear existing signatures before loading new ones
181193
cls._signature_groups = []
182194
cls._compiled_regexes = {}
183-
184195
yaml_data = cls._load_yaml(cls._yaml_path)
185196

186-
# Add custom GitHub token patterns
187-
github_patterns = {
188-
"Access Token": r"ghp_[0-9a-zA-Z]{32}",
189-
"Personal Token": r"github_pat_[a-zA-Z0-9]{22}_[a-zA-Z0-9]{59}",
190-
}
191-
cls._add_signature_group("GitHub", github_patterns)
192-
193197
# Process patterns from YAML
194198
for item in yaml_data:
195199
for service_name, patterns in item.items():
196-
if service_name == "GitHub":
197-
continue
198-
199200
service_patterns = {}
200201
for pattern_dict in patterns:
201202
for pattern_name, pattern in pattern_dict.items():
@@ -224,6 +225,7 @@ def find_in_string(cls, text: Union[str, List[str]]) -> List[Match]:
224225
raise RuntimeError("SecretFinder not initialized.")
225226

226227
matches = []
228+
found_values = set()
227229

228230
# Split text into lines for processing
229231
try:
@@ -233,32 +235,74 @@ def find_in_string(cls, text: Union[str, List[str]]) -> List[Match]:
233235
return []
234236

235237
for line_num, line in enumerate(lines, start=1):
236-
for group in cls._signature_groups:
237-
for pattern_name in group.patterns:
238-
regex_key = f"{group.name}:{pattern_name}"
239-
regex = cls._compiled_regexes.get(regex_key)
238+
matches.extend(cls._find_regex_matches(line, line_num, found_values))
239+
matches.extend(cls._find_high_entropy_matches(line, line_num, found_values))
240+
return matches
240241

241-
if not regex:
242+
@classmethod
243+
def _find_regex_matches(cls, line: str, line_num: int, found_values: set) -> List[Match]:
244+
"""Find matches using regex patterns."""
245+
matches = []
246+
for group in cls._signature_groups:
247+
for pattern_name, regex in group.patterns.items():
248+
regex_key = f"{group.name}:{pattern_name}"
249+
regex = cls._compiled_regexes.get(regex_key)
250+
if not regex:
251+
continue
252+
for match in regex.finditer(line):
253+
value = match.group()
254+
key = cls._extract_key_from_line(line, value)
255+
pattern = f"{key}:{value}"
256+
if value.lower() == "token" or pattern in found_values:
242257
continue
258+
found_values.add(pattern)
259+
matches.append(
260+
Match(
261+
group.name,
262+
pattern_name,
263+
key,
264+
value,
265+
line_num,
266+
match.start(),
267+
match.end(),
268+
)
269+
)
270+
return matches
243271

244-
try:
245-
for match in regex.finditer(line):
246-
value = match.group()
247-
if value.lower() == "token":
248-
continue
272+
@staticmethod
273+
def _extract_key_from_line(line: str, secret_value: str) -> Optional[str]:
274+
"""
275+
Extract the key associated with a secret value if it follows a key=value pattern.
276+
"""
277+
match = re.search(
278+
r'([A-Za-z_][A-Za-z0-9_]*)\s*=\s*["\']?' + re.escape(secret_value) + r'["\']?', line
279+
)
280+
return match.group(1) if match else None
249281

250-
matches.append(
251-
Match(
252-
service=group.name,
253-
type=pattern_name,
254-
value=value,
255-
line_number=line_num,
256-
start_index=match.start(),
257-
end_index=match.end(),
258-
)
259-
)
260-
except Exception as e:
261-
logger.warning(f"Error matching pattern {regex_key}: {e}")
262-
continue
282+
@classmethod
283+
def _find_high_entropy_matches(cls, line: str, line_num: int, found_values: set) -> List[Match]:
284+
"""Find matches based on high entropy values."""
285+
matches = []
286+
assignment_pattern = re.findall(
287+
r"([A-Za-z_][A-Za-z0-9_]*)\s*=\s*([\"']?([A-Za-z0-9_\-\.+/=]{8,})[\"']?)", line
288+
)
289+
290+
for key, _, word in assignment_pattern:
291+
pattern = f"{key}:{word}"
292+
if pattern in found_values or word.startswith("REDACTED"):
293+
continue
294+
if cls._calculate_entropy(word) >= cls.HIGH_ENTROPY_THRESHOLD:
295+
found_values.add(pattern)
296+
matches.append(
297+
Match(
298+
"High Entropy",
299+
"Potential Secret",
300+
key,
301+
word,
302+
line_num,
303+
line.find(word),
304+
line.find(word) + len(word),
305+
)
306+
)
263307

264308
return matches

tests/pipeline/secrets/test_secrets.py

+4
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ def test_hide_secret(self):
7878
match = Match(
7979
service="AWS",
8080
type="Access Key",
81+
key="API_KEY",
8182
value="AKIAIOSFODNN7EXAMPLE",
8283
line_number=1,
8384
start_index=0,
@@ -115,6 +116,7 @@ def test_hide_secret(self):
115116
match = Match(
116117
service="AWS",
117118
type="Access Key",
119+
key="API_KEY",
118120
value="AKIAIOSFODNN7EXAMPLE",
119121
line_number=1,
120122
start_index=0,
@@ -129,6 +131,8 @@ def test_obfuscate(self):
129131
# Test text with multiple secrets
130132
text = "API_KEY=AKIAIOSFODNN7EXAMPLE\nPASSWORD=AKIAIOSFODNN7EXAMPLE"
131133
protected, matched_secrets = self.obfuscator.obfuscate(text)
134+
print(protected)
135+
print(matched_secrets)
132136

133137
assert len(matched_secrets) == 2
134138
assert "AKIAIOSFODNN7EXAMPLE" not in protected

0 commit comments

Comments
 (0)