9
9
import multiprocessing
10
10
import os
11
11
import re
12
+ import shutil
12
13
import signal
13
14
import timeit
14
15
import unicodedata
15
16
from urllib .parse import urlparse , urlunparse
16
17
18
+ import cairosvg
19
+ import docx
20
+ import docx2txt
17
21
import easyocr
18
22
import fitz
19
23
import numpy as np
20
24
import pandas as pd
25
+ import pptx
21
26
import requests
22
27
import yaml
23
28
from bs4 import BeautifulSoup
27
32
UnstructuredHTMLLoader ,
28
33
UnstructuredImageLoader ,
29
34
UnstructuredMarkdownLoader ,
30
- UnstructuredPowerPointLoader ,
31
35
UnstructuredXMLLoader ,
32
36
)
33
37
from langchain_community .llms import HuggingFaceEndpoint
@@ -131,32 +135,81 @@ def load_txt(txt_path):
131
135
132
136
def load_doc(doc_path):
    """Load a legacy .doc file by converting it to .docx with LibreOffice.

    The file is converted next to the original (``doc_path + "x"``), parsed
    with ``load_docx`` and the intermediate .docx is removed afterwards.

    Args:
        doc_path: Path to the .doc file.

    Returns:
        The text extracted by ``load_docx`` from the converted document.

    Raises:
        FileNotFoundError: If the ``libreoffice`` executable is not installed.
        subprocess.CalledProcessError: If LibreOffice exits with a non-zero status.
    """
    import subprocess

    print("Converting doc file to docx file...")
    docx_path = doc_path + "x"
    # dirname() is "" for a bare filename; an empty --outdir would make
    # LibreOffice consume the input path as the output directory instead.
    out_dir = os.path.dirname(docx_path) or "."
    # Argument list (no shell): paths with spaces or shell metacharacters are
    # passed through verbatim instead of being re-parsed by a shell.
    subprocess.run(
        [
            "libreoffice",
            "--headless",
            "--invisible",
            "--convert-to",
            "docx",
            "--outdir",
            out_dir,
            doc_path,
        ],
        check=True,
    )
    print("Converted doc file to docx file.")
    try:
        text = load_docx(docx_path)
    finally:
        # Always drop the intermediate file, even when parsing fails.
        if os.path.exists(docx_path):
            os.remove(docx_path)
    return text
145
145
146
146
147
147
def load_docx(docx_path):
    """Load a .docx file, including text recognised in embedded images.

    Paragraph text is collected in document order, one line per paragraph.
    When a paragraph references an embedded image, the image is extracted
    and passed to ``load_image``; any text returned is appended right after
    that paragraph.

    Args:
        docx_path: Path to the .docx file.

    Returns:
        The extracted text, each paragraph / image text terminated by "\\n".
    """
    import tempfile

    doc = docx.Document(docx_path)

    # Map relationship id -> image file name for every embedded image part.
    rid2img = {
        rel.rId: os.path.basename(rel._target.partname)
        for rel in doc.part.rels.values()
        if isinstance(rel._target, docx.parts.image.ImagePart)
    }

    # Extract images into a private temporary directory rather than a fixed
    # "./imgs/" folder, so cleanup can never delete unrelated user files.
    save_path = tempfile.mkdtemp(prefix="docx_imgs_") if rid2img else None
    chunks = []
    try:
        if save_path is not None:
            docx2txt.process(docx_path, save_path)

        for paragraph in doc.paragraphs:
            chunks.append(paragraph.text + "\n")
            if save_path is None:
                continue
            xml = paragraph._p.xml
            if "graphicData" not in xml:
                continue
            for rid, img_name in rid2img.items():
                # Match the quoted attribute value so that "rId1" does not
                # also match "rId10", "rId11", ...
                if f'"{rid}"' not in xml:
                    continue
                img_path = os.path.join(save_path, img_name)
                # NOTE(review): docx2txt appears to extract only some image
                # formats; skip images that were not written to disk.
                if not os.path.exists(img_path):
                    continue
                img_text = load_image(img_path)
                if img_text:
                    chunks.append(img_text + "\n")
    finally:
        # Remove the extracted images even if parsing or OCR raised.
        if save_path is not None:
            shutil.rmtree(save_path, ignore_errors=True)
    return "".join(chunks)
173
+
174
+
175
def load_ppt(ppt_path):
    """Load a legacy .ppt file by converting it to .pptx with LibreOffice.

    The file is converted next to the original (``ppt_path + "x"``), parsed
    with ``load_pptx`` and the intermediate .pptx is removed afterwards.

    Args:
        ppt_path: Path to the .ppt file.

    Returns:
        The text extracted by ``load_pptx`` from the converted presentation.

    Raises:
        FileNotFoundError: If the ``libreoffice`` executable is not installed.
        subprocess.CalledProcessError: If LibreOffice exits with a non-zero status.
    """
    import subprocess

    print("Converting ppt file to pptx file...")
    pptx_path = ppt_path + "x"
    # dirname() is "" for a bare filename; an empty --outdir would make
    # LibreOffice consume the input path as the output directory instead.
    out_dir = os.path.dirname(pptx_path) or "."
    # Argument list (no shell): paths with spaces or shell metacharacters are
    # passed through verbatim instead of being re-parsed by a shell.
    subprocess.run(
        [
            "libreoffice",
            "--headless",
            "--invisible",
            "--convert-to",
            "pptx",
            "--outdir",
            out_dir,
            ppt_path,
        ],
        check=True,
    )
    print("Converted ppt file to pptx file.")
    try:
        text = load_pptx(pptx_path)
    finally:
        # Always drop the intermediate file, even when parsing fails.
        if os.path.exists(pptx_path):
            os.remove(pptx_path)
    return text
154
184
155
185
156
186
def load_pptx(pptx_path):
    """Load a .pptx file: shape text, table cells and text found in images.

    Shapes on each slide are visited top-to-bottom, then left-to-right.
    For every shape the text frame, the table contents (cells joined by
    tabs, rows by newlines) and the text ``load_image`` recognises in an
    embedded picture are appended, each followed by "\\n".

    Args:
        pptx_path: Path to the .pptx file.

    Returns:
        The concatenated text of all slides.
    """
    import tempfile

    def _position(shape):
        # NOTE(review): python-pptx can report None for top/left (e.g. a
        # placeholder inheriting its position); treat that as 0 so sorting
        # never compares None with an int.
        return (shape.top or 0, shape.left or 0)

    chunks = []
    prs = pptx.Presentation(pptx_path)
    # Pictures are dumped into a private temporary directory so that a file
    # of the same name in the working directory is never overwritten or
    # deleted, and nothing is left behind if OCR raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        for slide in prs.slides:
            for shape in sorted(slide.shapes, key=_position):
                if shape.has_text_frame and shape.text:
                    chunks.append(shape.text + "\n")

                if shape.has_table:
                    table_contents = "\n".join(
                        "\t".join(cell.text if hasattr(cell, "text") else "" for cell in row.cells)
                        for row in shape.table.rows
                        if hasattr(row, "cells")
                    )
                    if table_contents:
                        chunks.append(table_contents + "\n")

                try:
                    image = shape.image
                    blob = image.blob
                    filename = image.filename
                except (AttributeError, ValueError):
                    # Not a picture, or (NOTE(review): confirm) a picture
                    # without an embedded image, for which python-pptx raises.
                    continue

                # Keep the reported file name (and thus its extension).
                img_path = os.path.join(tmp_dir, os.path.basename(filename or "image"))
                with open(img_path, "wb") as f:
                    f.write(blob)
                img_text = load_image(img_path)
                if img_text:
                    chunks.append(img_text + "\n")
    return "".join(chunks)
161
214
162
215
@@ -214,13 +267,11 @@ def load_image(image_path):
214
267
return response .json ()["text" ].strip ()
215
268
loader = UnstructuredImageLoader (image_path )
216
269
text = loader .load ()[0 ].page_content
217
- return text
270
+ return text . strip ()
218
271
219
272
220
273
def load_svg (svg_path ):
221
274
"""Load the svg file."""
222
- import cairosvg
223
-
224
275
png_path = svg_path .replace (".svg" , ".png" )
225
276
cairosvg .svg2png (url = svg_path , write_to = png_path )
226
277
text = load_image (png_path )
@@ -239,7 +290,9 @@ def document_loader(doc_path):
239
290
return load_doc (doc_path )
240
291
elif doc_path .endswith (".docx" ):
241
292
return load_docx (doc_path )
242
- elif doc_path .endswith (".pptx" ) or doc_path .endswith (".ppt" ):
293
+ elif doc_path .endswith (".ppt" ):
294
+ return load_ppt (doc_path )
295
+ elif doc_path .endswith (".pptx" ):
243
296
return load_pptx (doc_path )
244
297
elif doc_path .endswith (".md" ):
245
298
return load_md (doc_path )
@@ -261,7 +314,7 @@ def document_loader(doc_path):
261
314
):
262
315
return load_image (doc_path )
263
316
elif doc_path .endswith (".svg" ):
264
- return load_image (doc_path )
317
+ return load_svg (doc_path )
265
318
else :
266
319
raise NotImplementedError (
267
320
"Current only support pdf, html, txt, doc, docx, pptx, ppt, md, xml"
0 commit comments