# Matches whitespace followed by a stray "/x" path fragment trailing a URL,
# so callers can trim it off the end of a match.
WS_SYNTAX_RM = re.compile(r"\s+/[a-zA-Z]")
4141
def url_re(open_end=False):
    """Build the generic URL regex, including a few obfuscation techniques.

    The main anchor is the URI scheme. When ``open_end`` is True the
    ``END_PUNCTUATION`` trimming group is omitted, so matches are not cut
    short at trailing punctuation.

    :param bool open_end: Skip the end-punctuation anchor?
    :rtype: compiled :py:class:`re` pattern

    NOTE(review): the previous version duplicated the entire verbose pattern
    in both branches; the only difference was the END_PUNCTUATION fragment,
    so the pattern is now built once.
    """
    # Only the trailing-punctuation fragment differs between the two modes.
    end_punctuation = r"" if open_end else END_PUNCTUATION

    return re.compile(r"""
        (
            # Scheme.
            [fhstu]\S\S?[px]s?

            # One of these delimiters/defangs.
            (?:
                :\/\/|
                :\\\\|
                \[:\]\/\/|
                :?__
            )

            # Any number of defang characters.
            (?:
                \x20|
        """ + SEPARATOR_DEFANGS + r"""
            )*

            # Domain/path characters.
            \w
            \S+?

            # CISCO ESA style defangs followed by domain/path characters.
            (?:\x20[\/\.][^\.\/\s]\S*?)*
        )
        """ + end_punctuation + r"""
        (?=\s|[^\x00-\x7F]|$)
        """, re.IGNORECASE | re.VERBOSE | re.UNICODE)
72108
73109# Get some obfuscated urls, main anchor is brackets around the period.
74110BRACKET_URL_RE = re .compile (r"""
@@ -256,7 +292,7 @@ def extract_iocs(data, refang=False, strip=False):
256292 )
257293
258294
def extract_urls(data, refang=False, strip=False, delimiter=None, open_punc=False):
    """Extract URLs.

    :param data: Input text
    :param bool refang: Refang output?
    :param bool strip: Strip possible garbage from the end of URLs
    :param delimiter: Forwarded to :func:`extract_encoded_urls`; when set to
        ``"space"``, encoded URLs are not split at whitespace/replacement
        characters
    :param bool open_punc: Forwarded to :func:`extract_unencoded_urls`;
        disables the end-punctuation anchor when matching
    :rtype: :py:func:`itertools.chain`
    """
    return itertools.chain(
        extract_unencoded_urls(data, refang=refang, strip=strip, open_punc=open_punc),
        extract_encoded_urls(data, refang=refang, strip=strip, delimiter=delimiter),
    )
271307
272308
273- def extract_unencoded_urls (data , refang = False , strip = False ):
309+ def extract_unencoded_urls (data , refang = False , strip = False , open_punc = False ):
274310 """Extract only unencoded URLs.
275311
276312 :param data: Input text
277313 :param bool refang: Refang output?
278314 :param bool strip: Strip possible garbage from the end of URLs
279315 :rtype: Iterator[:class:`str`]
280316 """
317+
281318 unencoded_urls = itertools .chain (
282- GENERIC_URL_RE .finditer (data ),
319+ url_re ( open_punc ) .finditer (data ),
283320 BRACKET_URL_RE .finditer (data ),
284321 BACKSLASH_URL_RE .finditer (data ),
285322 )
@@ -303,7 +340,7 @@ def found_ws(s):
303340 yield url
304341
305342
306- def extract_encoded_urls (data , refang = False , strip = False ):
343+ def extract_encoded_urls (data , refang = False , strip = False , delimiter = None ):
307344 """Extract only encoded URLs.
308345
309346 :param data: Input text
@@ -340,9 +377,11 @@ def extract_encoded_urls(data, refang=False, strip=False):
340377 # The only valid starts are "http" or "ftp", so look for h/f case insensitive.
341378 url = url [re .search ('[hHfF]' , url ).start ():]
342379
380+ if delimiter == "space" :
381+ pass
382+ else :
343383 # Stop at the first whitespace or non-unicode character.
344- url = url .split (u'\ufffd ' )[0 ].\
345- split ()[0 ]
384+ url = url .split (u'\ufffd ' )[0 ].split ()[0 ]
346385
347386 if strip :
348387 url = re .split (URL_SPLIT_STR , url )[0 ]
@@ -726,6 +765,7 @@ def main():
726765 parser .add_argument ('--wide' , action = 'store_true' ,
727766 help = "preprocess input to allow wide-encoded character matches. default: no" )
728767 parser .add_argument ('--json' , action = 'store_true' )
768+ parser .add_argument ('--open' , action = 'store_true' , help = "Removes the end punctuation regex when extracting URLs" )
729769
730770 args = parser .parse_args ()
731771
@@ -755,6 +795,8 @@ def main():
755795 memo ["ipv6s" ] = list (extract_ipv6s (data ))
756796 if args .extract_urls or extract_all :
757797 memo ["urls" ] = list (extract_urls (data , refang = args .refang , strip = args .strip_urls ))
798+ if args .extract_urls and args .open :
799+ memo ["urls" ] = list (extract_urls (data , refang = args .refang , strip = args .strip_urls , open_punc = True ))
758800 if args .extract_yara_rules or extract_all :
759801 memo ["yara_rules" ] = list (extract_yara_rules (data ))
760802 if args .extract_hashes or extract_all :
0 commit comments