
Commit e237454

Added support for extracting info from image in the docs (#120)
Signed-off-by: Ye, Xinyu <[email protected]>
1 parent 72a48d0 commit e237454

File tree

7 files changed: +95 additions, -22 deletions

comps/dataprep/README.md
comps/dataprep/milvus/requirements.txt
comps/dataprep/qdrant/README.md
comps/dataprep/qdrant/requirements.txt
comps/dataprep/redis/README.md
comps/dataprep/redis/langchain/requirements.txt
comps/dataprep/utils.py


comps/dataprep/README.md

Lines changed: 7 additions & 0 deletions

@@ -2,6 +2,13 @@
 
 The Dataprep Microservice aims to preprocess the data from various sources (either structured or unstructured data) to text data, and convert the text data to embedding vectors then store them in the database.
 
+## Install Requirements
+
+```bash
+apt-get update
+apt-get install libreoffice
+```
+
 ## Use LVM (Large Vision Model) for Summarizing Image Data
 
 Occasionally unstructured data will contain image data, to convert the image data to the text data, LVM can be used to summarize the image. To leverage LVM, please refer to this [readme](../lvms/README.md) to start the LVM microservice first and then set the below environment variable, before starting any dataprep microservice.
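The paragraph above describes the overall dataprep flow (document → text → embedding vectors → vector database). The snippet below is only an illustration of that flow, not code from this commit; it uses langchain-text-splitters and sentence-transformers, which appear in the requirements files touched here, and the model name and storage step are placeholders.

```python
# Illustrative sketch of the dataprep flow described above (not the microservice's code).
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer


def text_to_vectors(text: str):
    # 1. Split the extracted text into overlapping chunks.
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_text(text)
    # 2. Convert each chunk into an embedding vector (placeholder model name).
    model = SentenceTransformer("all-MiniLM-L6-v2")
    vectors = model.encode(chunks)
    # 3. The real service would now store (chunk, vector) pairs in Redis/Milvus/Qdrant.
    return list(zip(chunks, vectors))
```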

comps/dataprep/milvus/requirements.txt

Lines changed: 4 additions & 0 deletions

@@ -1,5 +1,7 @@
 beautifulsoup4
+cairosvg
 docarray[full]
+docx2txt
 easyocr
 fastapi
 frontend==0.0.3
@@ -8,6 +10,7 @@ langchain
 langchain-community
 langchain-text-splitters
 langchain_milvus
+markdown
 numpy
 opentelemetry-api
 opentelemetry-exporter-otlp
@@ -19,6 +22,7 @@ pydantic==2.7.3
 pymilvus==2.4.3
 pymupdf==1.24.5
 python-docx==0.8.11
+python-pptx
 sentence_transformers
 shortuuid
 unstructured[all-docs]==0.11.5
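The same four dependencies are added to each dataprep backend's requirements in this commit. A purely illustrative smoke test (not part of the commit) to confirm they installed, with comments reflecting how comps/dataprep/utils.py uses them:

```python
# Illustrative import check for the newly added dataprep dependencies.
import cairosvg   # SVG -> PNG conversion, used by load_svg
import docx2txt   # extracts images embedded in .docx files, used by load_docx
import markdown   # Markdown-to-HTML conversion library, newly listed here
import pptx       # python-pptx: slides, tables and embedded images, used by load_pptx

print("dataprep image-extraction dependencies import cleanly")
```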

comps/dataprep/qdrant/README.md

Lines changed: 2 additions & 1 deletion

@@ -13,7 +13,7 @@ apt-get install poppler-utils -y
 
 ## Start Qdrant Server
 
-Please refer to this [readme](../../../vectorstores/langchain/qdrant/README.md).
+Please refer to this [readme](../../vectorstores/langchain/qdrant/README.md).
 
 ## Setup Environment Variables
 
@@ -24,6 +24,7 @@ export https_proxy=${your_http_proxy}
 export QDRANT=${host_ip}
 export QDRANT_PORT=6333
 export COLLECTION_NAME=${your_collection_name}
+export PYTHONPATH=${path_to_comps}
 ```
 
 ## Start Document Preparation Microservice for Qdrant with Python Script
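The new `export PYTHONPATH=${path_to_comps}` line sits alongside the existing Qdrant connection variables. As an illustration only (not the microservice's code), those variables could be consumed like this, using the qdrant-client package from the requirements file:

```python
# Illustrative check: connect to the Qdrant server the exported variables point at.
import os

from qdrant_client import QdrantClient

client = QdrantClient(host=os.environ["QDRANT"], port=int(os.environ.get("QDRANT_PORT", 6333)))
print(client.get_collections())  # responds if the Qdrant server from the previous step is up
```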

comps/dataprep/qdrant/requirements.txt

Lines changed: 4 additions & 0 deletions

@@ -1,11 +1,14 @@
 beautifulsoup4
+cairosvg
 docarray[full]
+docx2txt
 easyocr
 fastapi
 huggingface_hub
 langchain
 langchain-community
 langchain-text-splitters
+markdown
 numpy
 opentelemetry-api
 opentelemetry-exporter-otlp
@@ -15,6 +18,7 @@ Pillow
 prometheus-fastapi-instrumentator
 pymupdf
 python-docx
+python-pptx
 qdrant-client
 sentence_transformers
 shortuuid

comps/dataprep/redis/README.md

Lines changed: 2 additions & 1 deletion

@@ -31,7 +31,7 @@ cd langchain_ray; pip install -r requirements_ray.txt
 
 ## 1.2 Start Redis Stack Server
 
-Please refer to this [readme](../../../vectorstores/langchain/redis/README.md).
+Please refer to this [readme](../../vectorstores/langchain/redis/README.md).
 
 ## 1.3 Setup Environment Variables
 
@@ -41,6 +41,7 @@ export INDEX_NAME=${your_index_name}
 export LANGCHAIN_TRACING_V2=true
 export LANGCHAIN_API_KEY=${your_langchain_api_key}
 export LANGCHAIN_PROJECT="opea/gen-ai-comps:dataprep"
+export PYTHONPATH=${path_to_comps}
 ```
 
 ## 1.4 Start Document Preparation Microservice for Redis with Python Script
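As in the Qdrant README, `PYTHONPATH=${path_to_comps}` is what lets the dataprep script import the `comps` package when started directly. A minimal, illustrative check (not part of the commit), assuming a local GenAIComps checkout:

```python
# Illustrative check that PYTHONPATH is set before starting the script.
import importlib.util

if importlib.util.find_spec("comps") is None:
    raise SystemExit("`comps` is not importable - set PYTHONPATH to your GenAIComps checkout")
print("comps package found; the dataprep microservice can be started")
```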

comps/dataprep/redis/langchain/requirements.txt

Lines changed: 3 additions & 0 deletions

@@ -1,5 +1,7 @@
 beautifulsoup4
+cairosvg
 docarray[full]
+docx2txt
 easyocr
 fastapi
 huggingface_hub
@@ -18,6 +20,7 @@ prometheus-fastapi-instrumentator
 pymupdf
 pyspark
 python-docx
+python-pptx
 redis
 sentence_transformers
 shortuuid

comps/dataprep/utils.py

Lines changed: 73 additions & 20 deletions

@@ -9,15 +9,20 @@
 import multiprocessing
 import os
 import re
+import shutil
 import signal
 import timeit
 import unicodedata
 from urllib.parse import urlparse, urlunparse
 
+import cairosvg
+import docx
+import docx2txt
 import easyocr
 import fitz
 import numpy as np
 import pandas as pd
+import pptx
 import requests
 import yaml
 from bs4 import BeautifulSoup
@@ -27,7 +32,6 @@
     UnstructuredHTMLLoader,
     UnstructuredImageLoader,
     UnstructuredMarkdownLoader,
-    UnstructuredPowerPointLoader,
     UnstructuredXMLLoader,
 )
 from langchain_community.llms import HuggingFaceEndpoint
@@ -131,32 +135,81 @@ def load_txt(txt_path):
 
 def load_doc(doc_path):
     """Load doc file."""
-    txt_path = doc_path.replace(".doc", ".txt")
-    try:
-        os.system(f'antiword "{doc_path}" > "{txt_path}"')
-    except:
-        raise AssertionError(
-            "antiword failed or not installed, if not installed,"
-            + 'use "apt-get update && apt-get install -y antiword" to install it.'
-        )
-    text = load_txt(txt_path)
-    os.remove(txt_path)
+    print("Converting doc file to docx file...")
+    docx_path = doc_path + "x"
+    os.system(f"libreoffice --headless --invisible --convert-to docx --outdir {os.path.dirname(docx_path)} {doc_path}")
+    print("Converted doc file to docx file.")
+    text = load_docx(docx_path)
+    os.remove(docx_path)
     return text
 
 
 def load_docx(docx_path):
     """Load docx file."""
-    doc = DDocument(docx_path)
+    doc = docx.Document(docx_path)
     text = ""
+    # Save all 'rId: filename' relationships in a dictionary and save the images, if any.
+    rid2img = {}
+    for r in doc.part.rels.values():
+        if isinstance(r._target, docx.parts.image.ImagePart):
+            rid2img[r.rId] = os.path.basename(r._target.partname)
+    if rid2img:
+        save_path = "./imgs/"
+        os.makedirs(save_path, exist_ok=True)
+        docx2txt.process(docx_path, save_path)
     for paragraph in doc.paragraphs:
-        text += paragraph.text
+        if hasattr(paragraph, "text"):
+            text += paragraph.text + "\n"
+        if "graphicData" in paragraph._p.xml:
+            for rid in rid2img:
+                if rid in paragraph._p.xml:
+                    img_path = os.path.join(save_path, rid2img[rid])
+                    img_text = load_image(img_path)
+                    if img_text:
+                        text += img_text + "\n"
+    if rid2img:
+        shutil.rmtree(save_path)
+    return text
+
+
+def load_ppt(ppt_path):
+    """Load ppt file."""
+    print("Converting ppt file to pptx file...")
+    pptx_path = ppt_path + "x"
+    os.system(f"libreoffice --headless --invisible --convert-to pptx --outdir {os.path.dirname(pptx_path)} {ppt_path}")
+    print("Converted ppt file to pptx file.")
+    text = load_pptx(pptx_path)
+    os.remove(pptx_path)
     return text
 
 
 def load_pptx(pptx_path):
     """Load pptx file."""
-    loader = UnstructuredPowerPointLoader(pptx_path)
-    text = loader.load()[0].page_content
+    text = ""
+    prs = pptx.Presentation(pptx_path)
+    for slide in prs.slides:
+        for shape in sorted(slide.shapes, key=lambda shape: (shape.top, shape.left)):
+            if shape.has_text_frame:
+                if shape.text:
+                    text += shape.text + "\n"
+            if shape.has_table:
+                table_contents = "\n".join(
+                    [
+                        "\t".join([(cell.text if hasattr(cell, "text") else "") for cell in row.cells])
+                        for row in shape.table.rows
+                        if hasattr(row, "cells")
+                    ]
+                )
+                if table_contents:
+                    text += table_contents + "\n"
+            if hasattr(shape, "image") and hasattr(shape.image, "blob"):
+                img_path = f"./{shape.image.filename}"
+                with open(img_path, "wb") as f:
+                    f.write(shape.image.blob)
+                img_text = load_image(img_path)
+                if img_text:
+                    text += img_text + "\n"
+                os.remove(img_path)
     return text
 
 
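The rewritten `load_doc` and the new `load_ppt` shell out to `libreoffice --headless --convert-to ...`, which is why the top-level README gains an Install Requirements section, while `load_docx` and `load_pptx` now route embedded pictures through `load_image`. A hedged usage sketch follows; it assumes `comps` is on PYTHONPATH and the file names are placeholders:

```python
# Illustrative use of the new loaders; replace the placeholder paths with real documents.
import shutil

from comps.dataprep.utils import document_loader  # assumes PYTHONPATH includes the repo root

assert shutil.which("libreoffice"), "libreoffice is required to convert legacy .doc/.ppt files"

for path in ["report.doc", "minutes.docx", "slides.ppt", "deck.pptx"]:
    text = document_loader(path)  # embedded images are OCR'd or LVM-summarized into the text
    print(path, "->", len(text), "characters extracted")
```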

@@ -214,13 +267,11 @@ def load_image(image_path):
         return response.json()["text"].strip()
     loader = UnstructuredImageLoader(image_path)
     text = loader.load()[0].page_content
-    return text
+    return text.strip()
 
 
 def load_svg(svg_path):
     """Load the svg file."""
-    import cairosvg
-
     png_path = svg_path.replace(".svg", ".png")
     cairosvg.svg2png(url=svg_path, write_to=png_path)
     text = load_image(png_path)
@@ -239,7 +290,9 @@ def document_loader(doc_path):
         return load_doc(doc_path)
     elif doc_path.endswith(".docx"):
         return load_docx(doc_path)
-    elif doc_path.endswith(".pptx") or doc_path.endswith(".ppt"):
+    elif doc_path.endswith(".ppt"):
+        return load_ppt(doc_path)
+    elif doc_path.endswith(".pptx"):
         return load_pptx(doc_path)
     elif doc_path.endswith(".md"):
         return load_md(doc_path)
@@ -261,7 +314,7 @@ def document_loader(doc_path):
     ):
         return load_image(doc_path)
     elif doc_path.endswith(".svg"):
-        return load_image(doc_path)
+        return load_svg(doc_path)
    else:
        raise NotImplementedError(
            "Current only support pdf, html, txt, doc, docx, pptx, ppt, md, xml"
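For images, `load_image` (context lines above) either calls the LVM microservice over HTTP and reads the reply as `response.json()["text"]`, or falls back to `UnstructuredImageLoader`. The sketch below only illustrates that remote path; the endpoint path and payload keys are placeholders, and the real request format is defined by the LVM microservice (see comps/lvms/README.md):

```python
# Purely illustrative: summarizing an image via an LVM microservice.
# Endpoint path and payload keys are placeholders, not the service's documented API.
import base64

import requests


def summarize_image(image_path: str, lvm_endpoint: str) -> str:
    with open(image_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode("utf-8")
    response = requests.post(
        lvm_endpoint, json={"image": b64, "prompt": "Describe this image."}, timeout=60
    )
    response.raise_for_status()
    return response.json()["text"].strip()  # matches how load_image reads the reply
```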