Skip to content

Commit 59c1ace

Browse files
authored
feat: remove pdfs and enable online pdf readings (microsoft#183)
* remove pdfs and enable online pdf reading
* update doc format
* use url as key
1 parent a8b22ce commit 59c1ace

10 files changed

Lines changed: 36 additions & 21 deletions

File tree

rdagent/app/general_model/general_model.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,22 @@
2020
def extract_models_and_implement(
2121
report_file_path: str,
2222
) -> None:
23+
"""
24+
Extracts models from a given PDF report file and implements the necessary operations.
25+
26+
Parameters:
27+
report_file_path (str): The local path or URL of the report file. The file must be a PDF file.
28+
29+
Example URLs of PDF reports:
30+
- https://arxiv.org/pdf/2210.09789
31+
- https://arxiv.org/pdf/2305.10498
32+
- https://arxiv.org/pdf/2110.14446
33+
- https://arxiv.org/pdf/2205.12454
34+
- https://arxiv.org/pdf/2210.16518
35+
36+
Returns:
37+
None
38+
"""
2339
with logger.tag("init"):
2440
scenario = GeneralModelScenario()
2541
logger.log_object(scenario, tag="scenario")

rdagent/app/qlib_rd_loop/factor_from_report_w_sc.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ def extract_hypothesis_and_exp_from_reports(report_file_path: str) -> Tuple[Qlib
8585
pdf_screenshot = extract_first_page_screenshot_from_pdf(report_file_path)
8686
logger.log_object(pdf_screenshot)
8787

88-
docs_dict = load_and_process_pdfs_by_langchain(Path(report_file_path))
88+
docs_dict = load_and_process_pdfs_by_langchain(report_file_path)
8989

9090
factor_result = {
9191
task.factor_name: {

rdagent/components/coder/model_coder/task_loader.py

Lines changed: 2 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -115,23 +115,12 @@ def load(self, model_dict: dict) -> list:
115115

116116

117117
class ModelExperimentLoaderFromPDFfiles(ModelTaskLoader):
118-
def load(self, file_or_folder_path: Path) -> dict:
119-
docs_dict = load_and_process_pdfs_by_langchain(Path(file_or_folder_path)) # dict{file_path:content}
118+
def load(self, file_or_folder_path: str) -> dict:
119+
docs_dict = load_and_process_pdfs_by_langchain(file_or_folder_path) # dict{file_path_or_url: content}
120120
model_dict = extract_model_from_docs(
121121
docs_dict
122122
) # dict{file_name: dict{model_name: dict{description, formulation, variables}}}
123123
model_dict = merge_file_to_model_dict_to_model_dict(
124124
model_dict
125125
) # dict {model_name: dict{description, formulation, variables}}
126126
return ModelExperimentLoaderFromDict().load(model_dict)
127-
128-
129-
def main(path="../test_doc"):
130-
doc_dict = load_and_process_pdfs_by_langchain(Path(path))
131-
print(doc_dict.keys()) # if you run code like "python -u", the print content will be truncated
132-
133-
134-
import fire
135-
136-
if __name__ == "__main__":
137-
fire.Fire(main)

rdagent/components/document_reader/document_reader.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,11 @@
66
import fitz
77
from azure.ai.formrecognizer import DocumentAnalysisClient
88
from azure.core.credentials import AzureKeyCredential
9-
from langchain.document_loaders import PyPDFDirectoryLoader, PyPDFLoader
9+
from langchain.document_loaders import (
10+
OnlinePDFLoader,
11+
PyPDFDirectoryLoader,
12+
PyPDFLoader,
13+
)
1014
from PIL import Image
1115

1216
if TYPE_CHECKING:
@@ -15,7 +19,7 @@
1519
from rdagent.core.conf import RD_AGENT_SETTINGS
1620

1721

18-
def load_documents_by_langchain(path: Path) -> list:
22+
def load_documents_by_langchain(path: str) -> list:
1923
"""Load documents from the specified path.
2024
2125
Args:
@@ -24,7 +28,10 @@ def load_documents_by_langchain(path: Path) -> list:
2428
Returns:
2529
list: A list of loaded documents.
2630
"""
27-
loader = PyPDFDirectoryLoader(str(path), silent_errors=True) if path.is_dir() else PyPDFLoader(str(path))
31+
if Path(path).is_dir():
32+
loader = PyPDFDirectoryLoader(path, silent_errors=True)
33+
else:
34+
loader = PyPDFLoader(path)
2835
return loader.load()
2936

3037

@@ -41,7 +48,10 @@ def process_documents_by_langchain(docs: list[Document]) -> dict[str, str]:
4148
content_dict = {}
4249

4350
for doc in docs:
44-
doc_name = str(Path(doc.metadata["source"]).resolve())
51+
if Path(doc.metadata["source"]).exists():
52+
doc_name = str(Path(doc.metadata["source"]).resolve())
53+
else:
54+
doc_name = doc.metadata["source"]
4555
doc_content = doc.page_content
4656

4757
if doc_name not in content_dict:
@@ -52,7 +62,7 @@ def process_documents_by_langchain(docs: list[Document]) -> dict[str, str]:
5262
return content_dict
5363

5464

55-
def load_and_process_pdfs_by_langchain(path: Path) -> dict[str, str]:
65+
def load_and_process_pdfs_by_langchain(path: str) -> dict[str, str]:
5666
return process_documents_by_langchain(load_documents_by_langchain(path))
5767

5868

-1.04 MB
Binary file not shown.
-1.09 MB
Binary file not shown.
-1.17 MB
Binary file not shown.
-1.19 MB
Binary file not shown.
-13.4 MB
Binary file not shown.

rdagent/scenarios/qlib/factor_experiment_loader/pdf_loader.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -509,9 +509,9 @@ def deduplicate_factors_by_llm( # noqa: C901, PLR0912
509509

510510

511511
class FactorExperimentLoaderFromPDFfiles(FactorExperimentLoader):
512-
def load(self, file_or_folder_path: Path) -> dict:
512+
def load(self, file_or_folder_path: str) -> dict:
513513
with logger.tag("docs"):
514-
docs_dict = load_and_process_pdfs_by_langchain(Path(file_or_folder_path))
514+
docs_dict = load_and_process_pdfs_by_langchain(file_or_folder_path)
515515
logger.log_object(docs_dict)
516516

517517
selected_report_dict = classify_report_from_dict(report_dict=docs_dict, vote_time=1)

0 commit comments

Comments
 (0)