-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathocr.py
More file actions
48 lines (35 loc) · 1.54 KB
/
ocr.py
File metadata and controls
48 lines (35 loc) · 1.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import fitz # PyMuPDF
import pytesseract
from PIL import Image
import os
import io
# Location of the Tesseract executable used by pytesseract.
# Set the TESSERACT_CMD environment variable to point at a different install;
# when it is unset, the original hard-coded Windows path is used unchanged.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD",
    r'C:\Users\acer\Tesseract-OCR\tesseract.exe',
)
def pdf_to_text(input_pdf, output_folder):
    """OCR every page of a PDF and save the text as a single Markdown file.

    Each page is rendered to a PNG image at 2x zoom, run through
    ``pytesseract.image_to_string``, and collected under a ``## Page N``
    heading. The result is written as UTF-8 to
    ``<output_folder>/<PDF base name>.md``, beginning with a
    ``# <PDF base name>`` title line. Progress messages are printed to stdout.

    Args:
        input_pdf: Path of the PDF file to convert.
        output_folder: Directory that receives the Markdown file; it is
            created if it does not exist.

    Returns:
        None.
    """
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Computed once: used for both the output file name and the title line.
    base_name = os.path.splitext(os.path.basename(input_pdf))[0]

    print(f"Converting PDF: {input_pdf}")
    all_text = []

    # Context manager guarantees the document is closed even if rendering
    # or OCR raises part-way through.
    with fitz.open(input_pdf) as pdf_document:
        page_count = len(pdf_document)
        for page_number, page in enumerate(pdf_document, start=1):
            print(f"Processing page {page_number}/{page_count}")

            # Render the page to an image; 2x zoom gives Tesseract more
            # pixels to work with than the default resolution.
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))

            # Extract text from the image, releasing the image afterwards.
            with Image.open(io.BytesIO(pix.tobytes("png"))) as img:
                text = pytesseract.image_to_string(img)

            all_text.append(f"\n## Page {page_number}\n\n{text}")

    # Write all text to a markdown file
    output_file = os.path.join(output_folder, f"{base_name}.md")
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(f"# {base_name}\n\n")
        f.write("\n".join(all_text))

    print(f"Text extracted and saved to: {output_file}")
if __name__ == "__main__":
    # Script entry point: OCR the bundled PDF and write the Markdown next to it.
    output_folder = "data"
    # os.path.join picks the platform's separator; on Windows this yields the
    # same "data\bhagavad-gita-as-it-is.pdf" string the script used before.
    input_pdf = os.path.join("data", "bhagavad-gita-as-it-is.pdf")
    pdf_to_text(input_pdf, output_folder)