|
41 | 41 | from langchain_community.llms import HuggingFaceEndpoint
|
42 | 42 | from PIL import Image
|
43 | 43 |
|
| 44 | +from comps import CustomLogger |
| 45 | + |
| 46 | +logger = CustomLogger("prepare_doc_util") |
| 47 | +logflag = os.getenv("LOGFLAG", False) |
| 48 | + |
44 | 49 |
|
45 | 50 | class TimeoutError(Exception):
|
46 | 51 | pass
|
@@ -428,14 +433,51 @@ def fetch(self, url, headers=None, max_times=5):
|
428 | 433 | if not headers:
|
429 | 434 | headers = self.headers
|
430 | 435 | while max_times:
|
431 |
| - if not url.startswith("http") or not url.startswith("https"): |
| 436 | + parsed_url = urlparse(url) |
| 437 | + if not parsed_url.scheme: |
432 | 438 | url = "http://" + url
|
433 |
| - print("start fetch %s...", url) |
| 439 | + if logflag: |
| 440 | + logger.info("start fetch %s..." % url) |
434 | 441 | try:
|
435 | 442 | response = requests.get(url, headers=headers, verify=True)
|
436 | 443 | if response.status_code != 200:
|
437 | 444 | print("fail to fetch %s, response status code: %s", url, response.status_code)
|
438 | 445 | else:
|
| 446 | + # Extract charset from the Content-Type header |
| 447 | + content_type = response.headers.get("Content-Type", "").lower() |
| 448 | + if "charset=" in content_type: |
| 449 | + # Extract charset value from the content-type header |
| 450 | + charset = content_type.split("charset=")[-1].strip() |
| 451 | + response.encoding = charset |
| 452 | + if logflag: |
| 453 | + logger.info(f"Charset detected and set: {response.encoding}") |
| 454 | + else: |
| 455 | + import re |
| 456 | + |
| 457 | + # Extract charset from the response HTML content |
| 458 | + charset_from_meta = None |
| 459 | + # Check for <meta charset="..."> |
| 460 | + match = re.search(r'<meta\s+charset=["\']?([^"\'>]+)["\']?', response.text, re.IGNORECASE) |
| 461 | + if match: |
| 462 | + charset_from_meta = match.group(1) |
| 463 | + # Check for <meta http-equiv="Content-Type" content="...; charset=..."> |
| 464 | + if not charset_from_meta: |
| 465 | + match = re.search( |
| 466 | + r'<meta\s+http-equiv=["\']?content-type["\']?\s+content=["\']?[^"\']*charset=([^"\'>]+)["\']?', |
| 467 | + response.text, |
| 468 | + re.IGNORECASE, |
| 469 | + ) |
| 470 | + if match: |
| 471 | + charset_from_meta = match.group(1) |
| 472 | + if charset_from_meta: |
| 473 | + response.encoding = charset_from_meta |
| 474 | + if logflag: |
| 475 | + logger.info(f"Charset detected and set from meta tag: {response.encoding}") |
| 476 | + else: |
| 477 | + # Fallback to default encoding |
| 478 | + response.encoding = "utf-8" |
| 479 | + if logflag: |
| 480 | + logger.info("Charset not specified, using default utf-8") |
439 | 481 | return response
|
440 | 482 | except Exception as e:
|
441 | 483 | print("fail to fetch %s, caused by %s", url, e)
|
@@ -540,8 +582,9 @@ def load_html_data(url):
|
540 | 582 | main_content = all_text if main_content == "" else main_content
|
541 | 583 | main_content = main_content.replace("\n", "")
|
542 | 584 | main_content = main_content.replace("\n\n", "")
|
543 |
| - main_content = uni_pro(main_content) |
544 | 585 | main_content = re.sub(r"\s+", " ", main_content)
|
| 586 | + if logflag: |
| 587 | + logger.info("main_content=[%s]" % main_content) |
545 | 588 |
|
546 | 589 | return main_content
|
547 | 590 |
|
|
0 commit comments