VedicGPT/ocr.py at main · L0veMathur/VedicGPT · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import os
import io

# Set Tesseract path. The TESSERACT_CMD environment variable takes precedence
# when set, so the script can run on other machines without editing this file;
# otherwise the original install location is used (update if installed elsewhere).
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", r'C:\Users\acer\Tesseract-OCR\tesseract.exe'
)

def pdf_to_text(input_pdf, output_folder) -> None:
    """OCR every page of a PDF and save the result as one Markdown file.

    Each page is rendered to a PNG image at 2x zoom, run through Tesseract,
    and stored under a ``## Page N`` heading. The Markdown file is named
    after the PDF (extension replaced by ``.md``), starts with a ``# <name>``
    title line, and is written into ``output_folder``.

    Args:
        input_pdf: Path of the PDF file to convert.
        output_folder: Directory that receives the Markdown file; it is
            created if it does not exist.
    """
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # The PDF's base name without extension serves as both the output file
    # name and the document title, so compute it once.
    title = os.path.splitext(os.path.basename(input_pdf))[0]

    # Open PDF
    print(f"Converting PDF: {input_pdf}")
    pdf_document = fitz.open(input_pdf)

    all_text = []
    try:
        page_count = len(pdf_document)

        # Process each page (page numbers are reported 1-based)
        for page_num, page in enumerate(pdf_document, start=1):
            print(f"Processing page {page_num}/{page_count}")

            # Convert page to image
            pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x zoom for better quality

            # Extract text from image; the image is released as soon as OCR
            # is done instead of lingering until garbage collection.
            with Image.open(io.BytesIO(pix.tobytes("png"))) as img:
                text = pytesseract.image_to_string(img)

            all_text.append(f"\n## Page {page_num}\n\n{text}")
    finally:
        # Close the document even when rendering or OCR fails part-way.
        pdf_document.close()

    # Write all text to a markdown file
    output_file = os.path.join(output_folder, f"{title}.md")
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(f"# {title}\n\n")
        f.write("\n".join(all_text))

    print(f"Text extracted and saved to: {output_file}")

if __name__ == "__main__":
    # Build the path with os.path.join rather than a literal backslash so the
    # PDF inside the "data" directory is also found on systems whose path
    # separator is not "\" (the resulting string is unchanged on Windows).
    input_pdf = os.path.join("data", "bhagavad-gita-as-it-is.pdf")
    output_folder = "data"

    pdf_to_text(input_pdf, output_folder)