@@ -256,7 +256,7 @@ def extract_iocs(data, refang=False, strip=False):
256256 )
257257
258258
259- def extract_urls (data , refang = False , strip = False ):
259+ def extract_urls (data , refang = False , strip = False , delimiter = None ):
260260 """Extract URLs.
261261
262262 :param data: Input text
@@ -266,7 +266,7 @@ def extract_urls(data, refang=False, strip=False):
266266 """
267267 return itertools .chain (
268268 extract_unencoded_urls (data , refang = refang , strip = strip ),
269- extract_encoded_urls (data , refang = refang , strip = strip ),
269+ extract_encoded_urls (data , refang = refang , strip = strip , delimiter = delimiter ),
270270 )
271271
272272
@@ -303,7 +303,7 @@ def found_ws(s):
303303 yield url
304304
305305
306- def extract_encoded_urls (data , refang = False , strip = False ):
306+ def extract_encoded_urls (data , refang = False , strip = False , delimiter = None ):
307307 """Extract only encoded URLs.
308308
309309 :param data: Input text
@@ -340,9 +340,11 @@ def extract_encoded_urls(data, refang=False, strip=False):
340340 # The only valid starts are "http" or "ftp", so look for h/f case insensitive.
341341 url = url [re .search ('[hHfF]' , url ).start ():]
342342
343+ if delimiter == "space" :
344+ pass
345+ else :
343346 # Stop at the first whitespace or Unicode replacement character (U+FFFD).
344- url = url .split (u'\ufffd ' )[0 ].\
345- split ()[0 ]
347+ url = url .split (u'\ufffd ' )[0 ].split ()[0 ]
346348
347349 if strip :
348350 url = re .split (URL_SPLIT_STR , url )[0 ]
0 commit comments