Skip to content

Commit 00ba370

Browse files
author
azazelm3dj3d
authored
Merge pull request #57 from InQuest/dev
General improvements
2 parents 13721d1 + fc2eacc commit 00ba370

File tree

2 files changed

+77
-35
lines changed

2 files changed

+77
-35
lines changed

iocextract.py

Lines changed: 76 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -39,36 +39,72 @@
3939
# Checks for whitespace and trailing characters after the URL
4040
WS_SYNTAX_RM = re.compile(r"\s+/[a-zA-Z]")
4141

42-
# Get basic url format, including a few obfuscation techniques, main anchor is the uri scheme.
43-
GENERIC_URL_RE = re.compile(r"""
44-
(
45-
# Scheme.
46-
[fhstu]\S\S?[px]s?
42+
def url_re(open_end=False):
4743

48-
# One of these delimiters/defangs.
49-
(?:
50-
:\/\/|
51-
:\\\\|
52-
\[:\]\/\/|
53-
:?__
54-
)
44+
if open_end:
45+
# Get basic url format, including a few obfuscation techniques, main anchor is the uri scheme.
46+
GENERIC_URL_RE = re.compile(r"""
47+
(
48+
# Scheme.
49+
[fhstu]\S\S?[px]s?
5550
56-
# Any number of defang characters.
57-
(?:
58-
\x20|
59-
""" + SEPARATOR_DEFANGS + r"""
60-
)*
51+
# One of these delimiters/defangs.
52+
(?:
53+
:\/\/|
54+
:\\\\|
55+
\[:\]\/\/|
56+
:?__
57+
)
58+
59+
# Any number of defang characters.
60+
(?:
61+
\x20|
62+
""" + SEPARATOR_DEFANGS + r"""
63+
)*
6164
62-
# Domain/path characters.
63-
\w
64-
\S+?
65+
# Domain/path characters.
66+
\w
67+
\S+?
6568
66-
# CISCO ESA style defangs followed by domain/path characters.
67-
(?:\x20[\/\.][^\.\/\s]\S*?)*
68-
)
69-
""" + END_PUNCTUATION + r"""
70-
(?=\s|[^\x00-\x7F]|$)
71-
""", re.IGNORECASE | re.VERBOSE | re.UNICODE)
69+
# CISCO ESA style defangs followed by domain/path characters.
70+
(?:\x20[\/\.][^\.\/\s]\S*?)*
71+
)
72+
""" + r"""
73+
(?=\s|[^\x00-\x7F]|$)
74+
""", re.IGNORECASE | re.VERBOSE | re.UNICODE)
75+
else:
76+
# Get basic url format, including a few obfuscation techniques, main anchor is the uri scheme.
77+
GENERIC_URL_RE = re.compile(r"""
78+
(
79+
# Scheme.
80+
[fhstu]\S\S?[px]s?
81+
82+
# One of these delimiters/defangs.
83+
(?:
84+
:\/\/|
85+
:\\\\|
86+
\[:\]\/\/|
87+
:?__
88+
)
89+
90+
# Any number of defang characters.
91+
(?:
92+
\x20|
93+
""" + SEPARATOR_DEFANGS + r"""
94+
)*
95+
96+
# Domain/path characters.
97+
\w
98+
\S+?
99+
100+
# CISCO ESA style defangs followed by domain/path characters.
101+
(?:\x20[\/\.][^\.\/\s]\S*?)*
102+
)
103+
""" + END_PUNCTUATION + r"""
104+
(?=\s|[^\x00-\x7F]|$)
105+
""", re.IGNORECASE | re.VERBOSE | re.UNICODE)
106+
107+
return GENERIC_URL_RE
72108

73109
# Get some obfuscated urls, main anchor is brackets around the period.
74110
BRACKET_URL_RE = re.compile(r"""
@@ -256,7 +292,7 @@ def extract_iocs(data, refang=False, strip=False):
256292
)
257293

258294

259-
def extract_urls(data, refang=False, strip=False):
295+
def extract_urls(data, refang=False, strip=False, delimiter=None, open_punc=False):
260296
"""Extract URLs.
261297
262298
:param data: Input text
@@ -265,21 +301,22 @@ def extract_urls(data, refang=False, strip=False):
265301
:rtype: :py:func:`itertools.chain`
266302
"""
267303
return itertools.chain(
268-
extract_unencoded_urls(data, refang=refang, strip=strip),
269-
extract_encoded_urls(data, refang=refang, strip=strip),
304+
extract_unencoded_urls(data, refang=refang, strip=strip, open_punc=open_punc),
305+
extract_encoded_urls(data, refang=refang, strip=strip, delimiter=delimiter),
270306
)
271307

272308

273-
def extract_unencoded_urls(data, refang=False, strip=False):
309+
def extract_unencoded_urls(data, refang=False, strip=False, open_punc=False):
274310
"""Extract only unencoded URLs.
275311
276312
:param data: Input text
277313
:param bool refang: Refang output?
278314
:param bool strip: Strip possible garbage from the end of URLs
279315
:rtype: Iterator[:class:`str`]
280316
"""
317+
281318
unencoded_urls = itertools.chain(
282-
GENERIC_URL_RE.finditer(data),
319+
url_re(open_punc).finditer(data),
283320
BRACKET_URL_RE.finditer(data),
284321
BACKSLASH_URL_RE.finditer(data),
285322
)
@@ -303,7 +340,7 @@ def found_ws(s):
303340
yield url
304341

305342

306-
def extract_encoded_urls(data, refang=False, strip=False):
343+
def extract_encoded_urls(data, refang=False, strip=False, delimiter=None):
307344
"""Extract only encoded URLs.
308345
309346
:param data: Input text
@@ -340,9 +377,11 @@ def extract_encoded_urls(data, refang=False, strip=False):
340377
# The only valid starts are "http" or "ftp", so look for h/f case insensitive.
341378
url = url[re.search('[hHfF]', url).start():]
342379

380+
if delimiter == "space":
381+
pass
382+
else:
343383
# Stop at the first whitespace or non-unicode character.
344-
url = url.split(u'\ufffd')[0].\
345-
split()[0]
384+
url = url.split(u'\ufffd')[0].split()[0]
346385

347386
if strip:
348387
url = re.split(URL_SPLIT_STR, url)[0]
@@ -726,6 +765,7 @@ def main():
726765
parser.add_argument('--wide', action='store_true',
727766
help="preprocess input to allow wide-encoded character matches. default: no")
728767
parser.add_argument('--json', action='store_true')
768+
parser.add_argument('--open', action='store_true', help="Removes the end punctuation regex when extracting URLs")
729769

730770
args = parser.parse_args()
731771

@@ -755,6 +795,8 @@ def main():
755795
memo["ipv6s"] = list(extract_ipv6s(data))
756796
if args.extract_urls or extract_all:
757797
memo["urls"] = list(extract_urls(data, refang=args.refang, strip=args.strip_urls))
798+
if args.extract_urls and args.open:
799+
memo["urls"] = list(extract_urls(data, refang=args.refang, strip=args.strip_urls, open_punc=True))
758800
if args.extract_yara_rules or extract_all:
759801
memo["yara_rules"] = list(extract_yara_rules(data))
760802
if args.extract_hashes or extract_all:

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
setup(
1010
name='iocextract',
11-
version='1.13.2',
11+
version='1.13.8',
1212
include_package_data=True,
1313
py_modules=[
1414
'iocextract',

0 commit comments

Comments (0)