"""Simple PDF-to-JSON demo using PyMuPDF.

This is intentionally much simpler than the full StatsChat pipeline.
It shows the core idea: open a PDF, extract text page by page,
and save the result as JSON.

Usage:
    python pdf_to_json_demo.py path/to/report.pdf

Optional:
    python pdf_to_json_demo.py path/to/report.pdf --out report.json
"""

from __future__ import annotations

import argparse
import json
from datetime import datetime, timezone
from pathlib import Path

import fitz  # PyMuPDF


def pdf_to_json(pdf_path: Path, output_path: Path) -> None:
    """Convert a PDF into a simple page-level JSON file.

    The text of every page is extracted with PyMuPDF and written, together
    with a small metadata header, to ``output_path`` as indented UTF-8 JSON.

    Args:
        pdf_path: Location of the PDF to read.
        output_path: Destination of the JSON file; it is opened in write
            mode, so existing content is replaced.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    # The context manager closes the document even if text extraction fails
    # part-way through, so the underlying file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        pages = [
            {
                "page_number": page_number,
                "page_text": page.get_text(),
            }
            for page_number, page in enumerate(doc, start=1)
        ]
        # Read while the document is still open; it is unavailable afterwards.
        page_count = doc.page_count

    output = {
        "metadata": {
            "source_file": pdf_path.name,
            "page_count": page_count,
            "created_at": datetime.now(timezone.utc).isoformat(),
            "extractor": "PyMuPDF",
        },
        "pages": pages,
    }

    # ensure_ascii=False keeps non-ASCII text readable instead of \u-escaped.
    with output_path.open("w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)

    print(f"Saved {len(pages)} pages to {output_path}")


def parse_args() -> argparse.Namespace:
    """Parse the command-line arguments for the converter.

    Returns:
        A namespace with ``pdf`` (the input path) and ``out`` (the output
        path, or ``None`` when ``--out`` was not supplied).
    """
    out_help = "Output JSON path. Defaults to the PDF name with .json extension."

    cli = argparse.ArgumentParser(description="Convert a PDF to simple JSON.")
    cli.add_argument("pdf", type=Path, help="Path to the PDF file")
    cli.add_argument("--out", type=Path, default=None, help=out_help)

    namespace = cli.parse_args()
    return namespace


def main() -> None:
    """Entry point: read the CLI arguments and convert the given PDF.

    When ``--out`` is omitted the JSON is written next to the PDF, using the
    same file name with a ``.json`` suffix.
    """
    cli_args = parse_args()
    source_pdf = cli_args.pdf

    # ``out`` is either None (flag not given) or a Path supplied by the user.
    if cli_args.out is None:
        destination = source_pdf.with_suffix(".json")
    else:
        destination = cli_args.out

    pdf_to_json(source_pdf, destination)


# Run the conversion only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
