Skip to content

Commit 01886fe

Browse files
Dataprep fetch page fix (#588)
* dataprep: Fix handling of URLs that start with "http"; get the encoding attribute of the web page. Signed-off-by: Cathy Zhang <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 3ce387a commit 01886fe

File tree

1 file changed

+46
-3
lines changed

1 file changed

+46
-3
lines changed

comps/dataprep/utils.py

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,11 @@
4141
from langchain_community.llms import HuggingFaceEndpoint
4242
from PIL import Image
4343

44+
from comps import CustomLogger
45+
46+
logger = CustomLogger("prepare_doc_util")
47+
logflag = os.getenv("LOGFLAG", False)
48+
4449

4550
class TimeoutError(Exception):
4651
pass
@@ -428,14 +433,51 @@ def fetch(self, url, headers=None, max_times=5):
428433
if not headers:
429434
headers = self.headers
430435
while max_times:
431-
if not url.startswith("http") or not url.startswith("https"):
436+
parsed_url = urlparse(url)
437+
if not parsed_url.scheme:
432438
url = "http://" + url
433-
print("start fetch %s...", url)
439+
if logflag:
440+
logger.info("start fetch %s..." % url)
434441
try:
435442
response = requests.get(url, headers=headers, verify=True)
436443
if response.status_code != 200:
437444
print("fail to fetch %s, response status code: %s", url, response.status_code)
438445
else:
446+
# Extract charset from the Content-Type header
447+
content_type = response.headers.get("Content-Type", "").lower()
448+
if "charset=" in content_type:
449+
# Extract charset value from the content-type header
450+
charset = content_type.split("charset=")[-1].strip()
451+
response.encoding = charset
452+
if logflag:
453+
logger.info(f"Charset detected and set: {response.encoding}")
454+
else:
455+
import re
456+
457+
# Extract charset from the response HTML content
458+
charset_from_meta = None
459+
# Check for <meta charset="...">
460+
match = re.search(r'<meta\s+charset=["\']?([^"\'>]+)["\']?', response.text, re.IGNORECASE)
461+
if match:
462+
charset_from_meta = match.group(1)
463+
# Check for <meta http-equiv="Content-Type" content="...; charset=...">
464+
if not charset_from_meta:
465+
match = re.search(
466+
r'<meta\s+http-equiv=["\']?content-type["\']?\s+content=["\']?[^"\']*charset=([^"\'>]+)["\']?',
467+
response.text,
468+
re.IGNORECASE,
469+
)
470+
if match:
471+
charset_from_meta = match.group(1)
472+
if charset_from_meta:
473+
response.encoding = charset_from_meta
474+
if logflag:
475+
logger.info(f"Charset detected and set from meta tag: {response.encoding}")
476+
else:
477+
# Fallback to default encoding
478+
response.encoding = "utf-8"
479+
if logflag:
480+
logger.info("Charset not specified, using default utf-8")
439481
return response
440482
except Exception as e:
441483
print("fail to fetch %s, caused by %s", url, e)
@@ -540,8 +582,9 @@ def load_html_data(url):
540582
main_content = all_text if main_content == "" else main_content
541583
main_content = main_content.replace("\n", "")
542584
main_content = main_content.replace("\n\n", "")
543-
main_content = uni_pro(main_content)
544585
main_content = re.sub(r"\s+", " ", main_content)
586+
if logflag:
587+
logger.info("main_content=[%s]" % main_content)
545588

546589
return main_content
547590

0 commit comments

Comments
 (0)