#!/usr/bin/env python3
from __future__ import annotations

import argparse
import string
from pathlib import Path


# Translation table mapping every character of ``string.punctuation`` to a space.
PUNCTUATION_TABLE = str.maketrans(dict.fromkeys(string.punctuation, " "))


def normalize(text: str) -> str:
    """Return *text* in a canonical form suitable for word comparison.

    The text is lower-cased, every ``string.punctuation`` character is
    replaced by a space, and runs of whitespace are collapsed so the result
    consists of words separated by single spaces with no leading or trailing
    whitespace.
    """
    lowered = text.lower()
    without_punctuation = lowered.translate(PUNCTUATION_TABLE)
    tokens = without_punctuation.split()
    return " ".join(tokens)


def expected_keywords(text: str) -> list[str]:
    """Pick the words of *text* that the transcript is required to contain.

    A word qualifies when it has at least four characters or is exactly
    ``"gpu"``. Words are taken from ``text.split()`` and kept in their
    original order, duplicates included. When no word qualifies, every word
    of *text* is returned instead.
    """
    tokens = text.split()
    selected = []
    for token in tokens:
        if token == "gpu" or len(token) >= 4:
            selected.append(token)
    if selected:
        return selected
    # Nothing qualified: fall back to all words.
    return tokens


def main() -> None:
    """Transcribe an audio file with Whisper and check it against expected text.

    Command-line arguments:
        audio: path of the audio file to transcribe.
        --expected: sentence the audio is expected to contain.
        --model: model name passed to ``whisper.load_model``.
        --output: optional file that receives a copy of the report.

    Both the transcript and the expected sentence are normalized, keywords
    are selected from the expected sentence, and the check passes only when
    every keyword occurs in the transcript as a whole word. A ``key=value``
    report is printed to stdout and, when ``--output`` is given, also written
    to that file (parent directories are created as needed).

    Raises:
        FileNotFoundError: if ``audio`` is not an existing regular file.
        SystemExit: once the report is emitted; exit code 0 when the check
            passed, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="Run a small Whisper ASR sanity check for generated test audio.")
    parser.add_argument("audio", type=Path)
    parser.add_argument("--expected", default="Welcome to Fontaine. The compiler stage is ready, and the GPU will speak.")
    parser.add_argument("--model", default="tiny.en")
    parser.add_argument("--output", type=Path)
    args = parser.parse_args()
    # is_file() rather than exists(), so a directory is rejected here instead
    # of being handed to the transcriber.
    if not args.audio.is_file():
        raise FileNotFoundError(f"Audio file not found: {args.audio}")

    # Imported here rather than at module level, so argument parsing
    # (including --help) and the file check run before whisper is imported.
    import whisper

    model = whisper.load_model(args.model, device="cpu")
    result = model.transcribe(str(args.audio), fp16=False, language="en")
    text = result["text"].strip()
    expected = normalize(args.expected)
    actual = normalize(text)
    keywords = expected_keywords(expected)
    # Match keywords against whole transcript words. A substring test on the
    # transcript string would accept e.g. "ready" inside "already" and let a
    # wrong transcript pass.
    transcript_words = set(actual.split())
    passed = bool(keywords) and all(word in transcript_words for word in keywords)
    report = "\n".join(
        [
            f"audio={args.audio}",
            f"expected={args.expected}",
            f"transcript={text}",
            f"normalized_expected={expected}",
            f"normalized_transcript={actual}",
            f"expected_keywords={' '.join(keywords)}",
            f"passed={passed}",
            "",
        ]
    )
    if args.output:
        args.output.parent.mkdir(parents=True, exist_ok=True)
        args.output.write_text(report, encoding="utf-8")
    print(report, end="")
    # The exit status mirrors the check result so callers can gate on it.
    raise SystemExit(0 if passed else 1)


# Run the sanity check only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
