Skip to content

Commit fc2eacc

Browse files
author
azazelm3dj3d
committed
Allows the user to remove end punctuation regex
1 parent 4100a2d commit fc2eacc

File tree

2 files changed

+71
-31
lines changed

2 files changed

+71
-31
lines changed

iocextract.py

Lines changed: 70 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -39,36 +39,72 @@
3939
# Checks for whitespace and trailing characters after the URL
4040
WS_SYNTAX_RM = re.compile(r"\s+/[a-zA-Z]")
4141

42-
# Get basic url format, including a few obfuscation techniques, main anchor is the uri scheme.
43-
GENERIC_URL_RE = re.compile(r"""
44-
(
45-
# Scheme.
46-
[fhstu]\S\S?[px]s?
42+
def _build_generic_url_re(open_end=False):
    """Compile the generic URL pattern.

    The pattern is anchored on the URI scheme and tolerates a few
    obfuscation/defang techniques. Both variants share the same body; the
    only difference is whether the ``END_PUNCTUATION`` sub-pattern is
    inserted between the captured URL and the trailing lookahead.

    :param bool open_end: If true, omit ``END_PUNCTUATION`` from the pattern
    :rtype: compiled regular expression object
    """
    end_punctuation = "" if open_end else END_PUNCTUATION

    return re.compile(r"""
        (
            # Scheme.
            [fhstu]\S\S?[px]s?

            # One of these delimiters/defangs.
            (?:
                :\/\/|
                :\\\\|
                \[:\]\/\/|
                :?__
            )

            # Any number of defang characters.
            (?:
                \x20|
                """ + SEPARATOR_DEFANGS + r"""
            )*

            # Domain/path characters.
            \w
            \S+?

            # CISCO ESA style defangs followed by domain/path characters.
            (?:\x20[\/\.][^\.\/\s]\S*?)*
        )
    """ + end_punctuation + r"""
        (?=\s|[^\x00-\x7F]|$)
    """, re.IGNORECASE | re.VERBOSE | re.UNICODE)


# Get basic url format, including a few obfuscation techniques, main anchor is the uri scheme.
# Kept as a module-level constant so existing references to GENERIC_URL_RE
# keep working, and so the pattern is compiled once at import time instead
# of on every call to url_re().
GENERIC_URL_RE = _build_generic_url_re()

# Same pattern without the END_PUNCTUATION sub-pattern.
_GENERIC_URL_OPEN_END_RE = _build_generic_url_re(open_end=True)


def url_re(open_end=False):
    """Return the compiled generic URL pattern.

    :param bool open_end: If true, return the variant that omits the
        ``END_PUNCTUATION`` sub-pattern; otherwise return ``GENERIC_URL_RE``
    :rtype: compiled regular expression object
    """
    return _GENERIC_URL_OPEN_END_RE if open_end else GENERIC_URL_RE
72108

73109
# Get some obfuscated urls, main anchor is brackets around the period.
74110
BRACKET_URL_RE = re.compile(r"""
@@ -256,7 +292,7 @@ def extract_iocs(data, refang=False, strip=False):
256292
)
257293

258294

259-
def extract_urls(data, refang=False, strip=False, delimiter=None):
295+
def extract_urls(data, refang=False, strip=False, delimiter=None, open_punc=False):
260296
"""Extract URLs.
261297
262298
:param data: Input text
@@ -265,21 +301,22 @@ def extract_urls(data, refang=False, strip=False, delimiter=None):
265301
:rtype: :py:func:`itertools.chain`
266302
"""
267303
return itertools.chain(
268-
extract_unencoded_urls(data, refang=refang, strip=strip),
304+
extract_unencoded_urls(data, refang=refang, strip=strip, open_punc=open_punc),
269305
extract_encoded_urls(data, refang=refang, strip=strip, delimiter=delimiter),
270306
)
271307

272308

273-
def extract_unencoded_urls(data, refang=False, strip=False):
309+
def extract_unencoded_urls(data, refang=False, strip=False, open_punc=False):
274310
"""Extract only unencoded URLs.
275311
276312
:param data: Input text
277313
:param bool refang: Refang output?
278314
:param bool strip: Strip possible garbage from the end of URLs
279315
:rtype: Iterator[:class:`str`]
280316
"""
317+
281318
unencoded_urls = itertools.chain(
282-
GENERIC_URL_RE.finditer(data),
319+
url_re(open_punc).finditer(data),
283320
BRACKET_URL_RE.finditer(data),
284321
BACKSLASH_URL_RE.finditer(data),
285322
)
@@ -728,6 +765,7 @@ def main():
728765
parser.add_argument('--wide', action='store_true',
729766
help="preprocess input to allow wide-encoded character matches. default: no")
730767
parser.add_argument('--json', action='store_true')
768+
parser.add_argument('--open', action='store_true', help="Removes the end puncuation regex when extracting URLs")
731769

732770
args = parser.parse_args()
733771

@@ -757,6 +795,8 @@ def main():
757795
memo["ipv6s"] = list(extract_ipv6s(data))
758796
if args.extract_urls or extract_all:
759797
memo["urls"] = list(extract_urls(data, refang=args.refang, strip=args.strip_urls))
798+
if args.extract_urls and args.open:
799+
memo["urls"] = list(extract_urls(data, refang=args.refang, strip=args.strip_urls, open_punc=True))
760800
if args.extract_yara_rules or extract_all:
761801
memo["yara_rules"] = list(extract_yara_rules(data))
762802
if args.extract_hashes or extract_all:

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
setup(
1010
name='iocextract',
11-
version='1.13.7',
11+
version='1.13.8',
1212
include_package_data=True,
1313
py_modules=[
1414
'iocextract',

0 commit comments

Comments
 (0)