tabuparse
is a Python CLI tool and library for extracting, normalizing, and merging tabular data from PDF documents.
Warning
This project is still in alpha; expect breaking changes and instability.
git clone https://github.com/lupeke/tabuparse.git && \
cd tabuparse && \
python3 -m venv .venv && source .venv/bin/activate && \
pip install -e .
python tests/check_install.py
# Process single PDF with default settings
tabuparse process example.pdf
# Process multiple PDFs with configuration
tabuparse process *.pdf --config settings.toml --output data.csv
# Export to SQLite with summary statistics
tabuparse process documents/*.pdf --format sqlite --summary
# Preview processing without extraction
tabuparse preview *.pdf --config settings.toml
# Extract from single PDF for testing
tabuparse extract document.pdf --pages "1-3" --flavor stream
import asyncio
from tabuparse import process_pdfs
async def main():
# Process PDFs and get merged DataFrame
result_df = await process_pdfs(
pdf_paths=['invoice1.pdf', 'invoice2.pdf'],
config_path='schema.toml',
output_format='csv'
)
print(f"Extracted {len(result_df)} rows")
print(result_df.head())
asyncio.run(main())
tabuparse
uses TOML configuration files to define extraction parameters and expected schemas.
tabuparse init-config settings.toml --columns "Invoice ID,Date,Amount,Description"
# settings.toml
[table_structure]
expected_columns = [
"Invoice ID",
"Date",
"Item Description",
"Quantity",
"Unit Price",
"Total Amount"
]
[settings]
output_format = "csv"
strict_schema = false
[default_extraction]
flavor = "lattice"
pages = "all"
# PDF-specific extraction parameters
[[extraction_parameters]]
pdf_path = "invoice_batch_1.pdf"
pages = "1-5"
flavor = "lattice"
[[extraction_parameters]]
pdf_path = "statements.pdf"
pages = "all"
flavor = "stream"
table_areas = ["72,72,432,648"] # left,bottom,right,top in points
expected_columns
: List of column names for schema normalization
output_format
: "csv" or "sqlite"
strict_schema
: Enable strict schema validation (fail on mismatches)
pages
: Page selection ("all", "1", "1,3,5", "1-3")
flavor
: Camelot extraction method ("lattice" or "stream")
table_areas
: Specific table regions to extract
pdf_path
: Apply parameters to specific PDF files
Extract and merge tables from multiple PDF files.
tabuparse process file1.pdf file2.pdf [OPTIONS]
Options:
-c, --config PATH TOML configuration file
-o, --output PATH Output file path
--format [csv|sqlite] Output format (default: csv)
--max-concurrent INT Max concurrent extractions (default: 5)
--summary Export summary statistics
--no-clean Disable data cleaning
--strict Enable strict schema validation
Extract tables from a single PDF (for testing).
tabuparse extract document.pdf [OPTIONS]
Options:
-c, --config PATH Configuration file
--pages TEXT Pages to extract
--flavor [lattice|stream] Extraction method
--show-info Show detailed table information
Preview processing statistics without extraction.
tabuparse preview file1.pdf file2.pdf [OPTIONS]
Options:
-c, --config PATH Configuration file
Generate sample configuration file.
tabuparse init-config config.toml [OPTIONS]
Options:
--columns TEXT Expected column names (comma-separated)
--format [csv|sqlite] Default output format
--flavor [lattice|stream] Default extraction flavor
Validate PDF file compatibility.
tabuparse validate document.pdf
from tabuparse import process_pdfs, extract_from_single_pdf
# Process multiple PDFs
result_df = await process_pdfs(
pdf_paths=['file1.pdf', 'file2.pdf'],
config_path='settings.toml',
output_path='output.csv',
output_format='csv',
max_concurrent=5
)
# Extract from single PDF
tables = await extract_from_single_pdf(
'document.pdf',
config_path='settings.toml'
)
from tabuparse.config_parser import parse_config, TabuparseConfig
# Load configuration
config = parse_config('settings.toml')
# Create programmatic configuration
config = TabuparseConfig(
expected_columns=['ID', 'Name', 'Amount'],
output_format='sqlite'
)
from tabuparse.data_processor import normalize_schema, merge_dataframes
# Normalize DataFrame schema
normalized_df = normalize_schema(
df,
expected_columns=['ID', 'Name', 'Amount'],
strict_mode=False
)
# Merge multiple DataFrames
merged_df = merge_dataframes([df1, df2, df3])
# Process invoice PDFs with predefined schema
tabuparse process invoices/*.pdf --config invoice_schema.toml --output invoices.csv
import asyncio
from tabuparse import process_pdfs
async def process_financial_data():
# Extract data
df = await process_pdfs(
pdf_paths=['q1_report.pdf', 'q2_report.pdf'],
config_path='financial_schema.toml'
)
# Export to multiple formats
df.to_csv('financial_data.csv', index=False)
df.to_excel('financial_data.xlsx', index=False)
return df
asyncio.run(process_financial_data())
import asyncio
from tabuparse.pdf_extractor import extract_tables_from_pdf
from tabuparse.data_processor import normalize_schema
from tabuparse.output_writer import write_sqlite
async def custom_pipeline():
# Extract tables
tables = await extract_tables_from_pdf('document.pdf')
# Process each table
processed_tables = []
for table in tables:
normalized = normalize_schema(
table,
expected_columns=['ID', 'Date', 'Amount']
)
processed_tables.append(normalized)
# Merge and export
import pandas as pd
merged = pd.concat(processed_tables, ignore_index=True)
write_sqlite(merged, 'output.sqlite', table_name='extracted_data')
asyncio.run(custom_pipeline())
Samplings icons by Afian Rochmah Afif - Flaticon