#!/usr/bin/env python3
"""
AIPM 1.2 — Python Reference Implementation (version-locked to 1.2)
AI Provenance Mark · https://aipmq.org/1.2/spec/

Generate AIPM 1.2 provenance URLs, compress payloads, and hash files.
No external dependencies — stdlib only (hashlib, zlib, base64, json, urllib, datetime).

Documentation:
  Specification:     https://aipmq.org/1.2/spec/
  Implementor guide: https://aipmq.org/1.2/guide/

This file is version-locked to AIPM 1.2. Future AIPM versions will have their own
reference implementation in their respective version directories.

Hash format (AIPM 1.2)
-----------------------
Content hashes (hs, hd) use W3C SRI format: "algo-base64url"
  e.g. "sha256-uU0nuZNNPgilLlLX2n2r-sSE7-N6U4DukIj3rOLvzek" (50 chars)
This makes the algorithm explicit and is forward-compatible with future algorithms
(sha3-256, sha512, etc.) without changing parameter names.

sha256_file() and sha256_bytes() return SRI format by default.
build_aipm_url() also accepts legacy hex (64 chars) and plain base64url (43 chars)
and normalises them to SRI format automatically.

Usage examples
--------------
# Generate a plain URL
url = build_aipm_url(role="prompted+reviewed", model="Claude Sonnet 4.6",
                     ctx="Internal memo draft", date="2026-05-04")

# Generate with compression (auto-triggers above ~200 ctx chars)
url = build_aipm_url(role="prompted+reviewed", ctx="A very long prompt...")

# Generate with file hashes (returns SRI format automatically)
hs = sha256_file("article.pdf")
hd = sha256_file("context.md")
url = build_aipm_url(role="prompted+reviewed", src="https://example.com/article.pdf",
                     hs=hs, doc="https://gist.github.com/user/abc", hd=hd)

# Generate for automated pipeline
url = build_aipm_url(role="automated", model="GPT-4o", type_="text",
                     org="Acme Publishing", ctx="Automated news brief")
"""

import zlib
import base64
import json
import hashlib
import urllib.parse
from datetime import datetime, timezone
from typing import Optional

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Default base of every generated URL; build_aipm_url(base=...) overrides it.
BASE_URL = "https://aipmq.org/1.2/aipm/"
# Written as the "v" field of every URL / compressed payload built by this module.
VERSION = "1.2"
# Only consulted when build_aipm_url() is called with force_compress=None.
COMPRESS_AUTO_THRESHOLD = 200   # ctx chars above which auto-compression fires
# build_aipm_url() appends "&qr=0" to URLs longer than this many UTF-8 bytes.
QR_BYTE_LIMIT = 1000            # conservative limit — qrcodejs 1.0.0 underestimates
                                 # version for binary-mode base64url URLs

# Accepted values for the `role` argument of build_aipm_url(); anything else
# raises ValueError there.
VALID_ROLES = {
    "all", "wrote", "prompted+edited", "prompted+reviewed", "prompted",
    "edited", "reviewed", "supervised", "automated",
}

# Accepted values for a non-empty `type_` argument of build_aipm_url().
VALID_TYPES = {"text", "code", "image", "audio", "video", "multimodal"}


# ---------------------------------------------------------------------------
# Hash utilities — W3C SRI format: "algo-base64url"
# ---------------------------------------------------------------------------

def _buf_to_base64url(digest_bytes: bytes) -> str:
    """Encode raw bytes as unpadded base64url text.

    Args:
        digest_bytes: Raw bytes, typically a hash digest.

    Returns:
        ASCII string using the URL-safe alphabet ("-" and "_" instead of
        "+" and "/") with the trailing "=" padding removed.
    """
    encoded = base64.urlsafe_b64encode(digest_bytes)
    return encoded.rstrip(b"=").decode("ascii")


def _hex_to_base64url(hex_str: str) -> str:
    """Re-encode a hexadecimal digest string as unpadded base64url.

    Args:
        hex_str: Hex digits accepted by bytes.fromhex().

    Returns:
        The same bytes rendered by _buf_to_base64url().

    Raises:
        ValueError: If hex_str is not valid hexadecimal (from bytes.fromhex).
    """
    raw = bytes.fromhex(hex_str)
    return _buf_to_base64url(raw)


def normalize_hash(value: str) -> str:
    """Normalise a hash string to the SRI-style form 'algo-base64url'.

    Surrounding whitespace is stripped first. Recognised inputs:
        - SRI format:      "sha256-uU0nuZNNPgilL..."  → returned unchanged
        - Hex digest:      "b94d27b9934d3e08..."      → 64 hex chars, converted
                                                        to "sha256-<base64url>"
        - Plain base64url: "uU0nuZNNPgilLlLX..."      → exactly 43 base64url
                                                        chars, prefixed with
                                                        "sha256-"

    Anything else is returned (stripped) unchanged. A 43-character string is
    only treated as a plain digest when every character belongs to the
    base64url alphabet (A-Z, a-z, 0-9, "-", "_"); arbitrary 43-character text
    is no longer mislabelled as a sha256 hash.

    Args:
        value: Hash value in any accepted format.

    Returns:
        Hash in SRI format, e.g.
        "sha256-uU0nuZNNPgilLlLX2n2r-sSE7-N6U4DukIj3rOLvzek", or the stripped
        input when the format is not recognised.
    """
    v = value.strip()

    # Already SRI: a non-empty algorithm label, "-", then at least 43 chars.
    # A bare 43-char base64url digest can never match, because whatever
    # follows its first "-" is necessarily shorter than 43 chars.
    algo, sep, digest = v.partition("-")
    if sep and algo and len(digest) >= 43:
        return v

    # Legacy hex: exactly 64 hex digits in either case.
    if len(v) == 64 and all(c in "0123456789abcdefABCDEF" for c in v):
        return "sha256-" + _hex_to_base64url(v)

    # Plain base64url SHA-256 digest: 43 chars from the base64url alphabet.
    # Mapping "-" and "_" to a letter lets isalnum() check the rest, and
    # isascii() rules out non-ASCII letters and digits.
    if (
        len(v) == 43
        and v.isascii()
        and v.replace("-", "A").replace("_", "A").isalnum()
    ):
        return "sha256-" + v

    return v  # unknown format — pass through


# ---------------------------------------------------------------------------
# File hashing — returns W3C SRI format
# ---------------------------------------------------------------------------

def sha256_file(path: str, chunk_size: int = 1 << 20) -> str:
    """Hash a file's raw bytes with SHA-256 and return the SRI-style string.

    The file is opened in binary mode and fed to the hasher one chunk at a
    time, so the whole file is never held in memory.

    Args:
        path: Path to the file.
        chunk_size: Number of bytes requested per read (default 1 << 20).

    Returns:
        "sha256-<base64url>" — 50 chars, e.g.
        "sha256-uU0nuZNNPgilLlLX2n2r-sSE7-N6U4DukIj3rOLvzek".

    Example:
        hs = sha256_file("article.pdf")
        hd = sha256_file("context.md")
        url = build_aipm_url(role="prompted+reviewed", hs=hs, hd=hd)
    """
    hasher = hashlib.sha256()
    with open(path, "rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for block in iter(lambda: stream.read(chunk_size), b""):
            hasher.update(block)
    return "sha256-" + _buf_to_base64url(hasher.digest())


def sha256_bytes(data: bytes) -> str:
    """Hash an in-memory bytes object with SHA-256, SRI-style.

    Args:
        data: Raw bytes to hash.

    Returns:
        "sha256-<base64url>" (50 chars).
    """
    raw_digest = hashlib.sha256(data).digest()
    return f"sha256-{_buf_to_base64url(raw_digest)}"


# ---------------------------------------------------------------------------
# Compression (deflate-raw + base64url — matches browser implementation)
# ---------------------------------------------------------------------------

def compress_payload(obj: dict) -> str:
    """Compress a metadata dict to a base64url deflate-raw string.

    The dict is serialised as compact JSON (no spaces after separators),
    encoded as UTF-8, compressed as a raw deflate stream and rendered as
    unpadded base64url. This is intended to match the browser's
    CompressionStream('deflate-raw') + base64url encoding used by the
    AIPM 1.2 generator.

    Args:
        obj: Metadata dict (all string/int values, no nulls).

    Returns:
        base64url-encoded compressed string (no padding).
    """
    raw = json.dumps(obj, separators=(",", ":")).encode("utf-8")
    # wbits=-15 → raw deflate (no zlib header or checksum), i.e. deflate-raw.
    # zlib.compress() only accepts wbits from Python 3.11 onwards, whereas
    # compressobj() takes it on every version this file otherwise runs on.
    deflater = zlib.compressobj(level=9, wbits=-15)
    compressed = deflater.compress(raw) + deflater.flush()
    # URL-safe alphabet ("-" / "_"), trailing "=" padding stripped.
    return base64.urlsafe_b64encode(compressed).rstrip(b"=").decode("ascii")


def decompress_payload(z: str) -> dict:
    """Decompress a base64url deflate-raw payload back to a metadata dict.

    Reverses compress_payload(): restores the base64 padding, decodes,
    inflates the raw deflate stream and parses the UTF-8 JSON text.

    Args:
        z: The value of the z= URL parameter.

    Returns:
        Metadata dict.

    Raises:
        ValueError: If base64 decoding, decompression or JSON parsing fails,
            or if the decoded JSON is not an object (dict).
    """
    # Map the URL-safe alphabet back to standard base64 and restore the "="
    # padding that was stripped when the payload was built.
    b64 = z.replace("-", "+").replace("_", "/")
    b64 += "=" * (-len(b64) % 4)
    try:
        compressed = base64.b64decode(b64)
        raw = zlib.decompress(compressed, wbits=-15)  # -15 → raw deflate
        data = json.loads(raw.decode("utf-8"))
    except Exception as e:
        raise ValueError(f"Failed to decompress AIPM payload: {e}") from e
    # Valid JSON that is not an object (list, number, string, null) is not a
    # metadata record; reject it here so callers can rely on getting a dict.
    if not isinstance(data, dict):
        raise ValueError(
            "Failed to decompress AIPM payload: expected a JSON object, "
            f"got {type(data).__name__}"
        )
    return data


# ---------------------------------------------------------------------------
# URL builder
# ---------------------------------------------------------------------------

def build_aipm_url(
    role: str,
    *,
    model: str = "",
    ctx: str = "",
    date: str = "",
    show: bool = True,
    src: str = "",
    hs: str = "",
    type_: str = "",
    doc: str = "",
    hd: str = "",
    author: str = "",
    org: str = "",
    prev: str = "",
    lang: str = "",
    base: str = BASE_URL,
    force_compress: Optional[bool] = None,
) -> str:
    """Assemble an AIPM 1.2 provenance URL from the given metadata.

    All metadata travels in the URL fragment. Empty (falsy) fields are left
    out. The fragment is either a list of percent-encoded ``key=value`` pairs
    or, when compression is used, a single ``z=`` payload produced by
    compress_payload(). If the finished URL is longer than QR_BYTE_LIMIT
    bytes, ``&qr=0`` is appended.

    Args:
        role: Human role (required). One of: all, wrote, prompted+edited,
              prompted+reviewed, prompted, edited, reviewed, supervised,
              automated.
        model: AI model or system name (e.g. "Claude Sonnet 4.6").
        ctx: Context, prompt, or purpose description.
        date: ISO 8601 date or datetime (e.g. "2026-05-04" or
              "2026-05-04T14:30-04:00"). Not validated. When empty, the
              current UTC time is used, to the minute, with a "+00:00" offset.
        show: Include role abbreviation in QR mark (Show mode). Default True.
        src: Content URL (documented as HTTPS only; not validated here).
        hs: SHA-256 hash of the src file. Accepts SRI format
            (sha256-base64url), plain base64url (43 chars), or legacy hex
            (64 chars); passed through normalize_hash(). Use sha256_file().
        type_: Content type: text, code, image, audio, video, or multimodal.
        doc: Full Context Document URL (documented as HTTPS only; not
             validated here).
        hd: SHA-256 hash of the doc file. Same format options as hs.
        author: Individual human creator name or identifier.
        org: Institution or organization.
        prev: Previous AIPM record URL.
        lang: BCP 47 language tag (e.g. "en", "zh-Hans").
        base: Base URL. Override for self-hosted instances. A trailing "/"
              is normalised so exactly one precedes the "#".
        force_compress: True/False forces compression on/off. None (default)
              compresses only when ctx is longer than
              COMPRESS_AUTO_THRESHOLD characters.

    Returns:
        Full AIPM 1.2 URL with hash fragment.

    Raises:
        ValueError: If role is not in VALID_ROLES, or type_ is non-empty and
            not in VALID_TYPES.
    """
    if role not in VALID_ROLES:
        allowed_roles = ", ".join(sorted(VALID_ROLES))
        raise ValueError(f"Invalid role: {role!r}. Valid values: {allowed_roles}")
    if type_ and type_ not in VALID_TYPES:
        allowed_types = ", ".join(sorted(VALID_TYPES))
        raise ValueError(f"Invalid type: {type_!r}. Valid values: {allowed_types}")

    # Bring any supplied hash (hex, plain base64url, or SRI) to SRI form.
    hs = normalize_hash(hs) if hs else hs
    hd = normalize_hash(hd) if hd else hd

    # No date supplied: stamp with the current UTC time and an explicit offset.
    if not date:
        date = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M+00:00")

    # An explicit force_compress wins; otherwise the ctx length decides.
    if force_compress is None:
        compress = len(ctx) > COMPRESS_AUTO_THRESHOLD
    else:
        compress = force_compress

    # Single ordered field table shared by both encodings. The order is part
    # of the output (JSON key order / query-parameter order). "v" always
    # comes first and is added per branch below.
    fields = (
        ("model", model),
        ("role", role),
        ("date", date),
        ("show", show),
        ("ctx", ctx),
        ("lang", lang),
        ("src", src),
        ("hs", hs),
        ("type", type_),
        ("doc", doc),
        ("hd", hd),
        ("author", author),
        ("org", org),
        ("prev", prev),
    )
    present = [(name, value) for name, value in fields if value]

    if compress:
        # Compressed form: JSON object with "show" stored as the integer 1.
        payload: dict = {"v": VERSION}
        for name, value in present:
            payload[name] = 1 if name == "show" else value
        fragment = "z=" + compress_payload(payload)
    else:
        # Plain form: percent-encoded pairs with "show" rendered as "1".
        pairs = [("v", VERSION)]
        pairs.extend(
            (name, "1" if name == "show" else value) for name, value in present
        )
        fragment = urllib.parse.urlencode(
            [(name, value) for name, value in pairs if value],
            quote_via=urllib.parse.quote,
        )

    url = base.rstrip("/") + "/#" + fragment
    # Over the byte limit: append the qr=0 marker.
    if len(url.encode("utf-8")) > QR_BYTE_LIMIT:
        url += "&qr=0"
    return url


# ---------------------------------------------------------------------------
# URL parsing
# ---------------------------------------------------------------------------

def parse_aipm_url(url: str) -> dict:
    """Extract the metadata carried in an AIPM URL's hash fragment.

    The fragment is parsed as a query string; blank values are dropped and
    only the first value of a repeated key is kept.

    - Compressed URLs (a ``z`` parameter is present): the payload is expanded
      with decompress_payload() and returned as-is, with ``qr`` set to the
      integer 0 when the fragment also carries ``qr=0``.
    - Plain URLs: the known text fields are copied as strings, ``show``
      becomes 1 when its value is exactly "1" and 0 otherwise, and ``qr`` is
      copied as the raw string. Unknown parameters are ignored.

    Note that ``qr`` is therefore an int for compressed URLs but a str for
    plain ones.

    Args:
        url: Any AIPM URL including the hash fragment.

    Returns:
        Dict with all present fields.

    Raises:
        ValueError: If the URL has no hash fragment or decompression fails.
    """
    fragment = urllib.parse.urlparse(url).fragment
    if not fragment:
        raise ValueError("URL has no hash fragment — not a valid AIPM URL.")

    # parse_qs yields a list per key; keep only the first occurrence of each.
    multi = urllib.parse.parse_qs(fragment, keep_blank_values=False)
    fields = {name: values[0] for name, values in multi.items()}

    if "z" in fields:
        record = decompress_payload(fields["z"])
        if fields.get("qr") == "0":
            record["qr"] = 0
        return record

    text_keys = (
        "v", "model", "role", "date", "ctx", "lang", "src", "hs",
        "type", "doc", "hd", "author", "org", "prev",
    )
    parsed: dict = {name: fields[name] for name in text_keys if name in fields}
    if "show" in fields:
        parsed["show"] = int(fields["show"] == "1")
    if "qr" in fields:
        parsed["qr"] = fields["qr"]
    return parsed


# ---------------------------------------------------------------------------
# CLI — basic usage
# ---------------------------------------------------------------------------

def _main(argv: Optional[list] = None) -> int:
    """Run the AIPM 1.2 command-line interface.

    Subcommands: ``generate`` (build a URL), ``hash`` (print the SRI hash of
    each file) and ``parse`` (print the fields of a URL). With no subcommand
    the help text is printed.

    Invalid input (a ValueError from URL building or parsing, an OSError
    while reading a file to hash) is reported on stderr and reflected in the
    exit status instead of surfacing as a traceback. ``hash`` keeps going
    after an unreadable file so the remaining files are still processed.

    Args:
        argv: Argument list without the program name. None lets argparse
            read sys.argv[1:].

    Returns:
        Process exit status: 0 on success, 1 if a command failed.
    """
    import argparse
    import sys

    # NOTE: argparse applies %-formatting to the epilog because it contains
    # %(prog)s, so every literal percent sign must be written as "%%".
    # An unescaped "%2B" makes --help raise
    # "ValueError: unsupported format character 'B'".
    parser = argparse.ArgumentParser(
        description="AIPM 1.2 — generate provenance URLs and hash files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s generate --role prompted+reviewed --model "Claude Sonnet 4.6" --ctx "Blog post draft"
  %(prog)s generate --role automated --model GPT-4o --type text --org "Acme Corp"
  %(prog)s hash article.pdf
  %(prog)s hash context.md article.pdf
  %(prog)s parse "https://aipmq.org/1.2/aipm/#v=1.2&role=prompted%%2Breviewed&..."

Hash output format:
  sha256_file() and sha256_bytes() output W3C SRI format: "sha256-<base64url>"
  e.g. "sha256-uU0nuZNNPgilLlLX2n2r-sSE7-N6U4DukIj3rOLvzek"
  The 'hash' subcommand outputs the same format.
  Legacy hex (64 chars) passed to --hs / --hd is automatically normalised.

Documentation:
  Specification:      https://aipmq.org/1.2/spec/
  Implementor guide:  https://aipmq.org/1.2/guide/
  Generate a mark:    https://aipmq.org/1.2/generate/

This is the AIPM 1.2 reference implementation. Each version of AIPM has its own
aipm.py — this file implements version 1.2 only and will not be updated for 1.3+.
""",
    )
    sub = parser.add_subparsers(dest="command")

    gen = sub.add_parser("generate", help="Generate an AIPM URL")
    gen.add_argument("--role",   required=True, help=f"Human role. One of: {', '.join(sorted(VALID_ROLES))}")
    gen.add_argument("--model",  default="")
    gen.add_argument("--ctx",    default="", help="Context, prompt, or purpose")
    gen.add_argument("--date",   default="", help="ISO 8601 date or datetime (default: now)")
    gen.add_argument("--no-show", action="store_true", help="Disable Show mode")
    gen.add_argument("--src",    default="", help="Content URL")
    gen.add_argument("--hs",     default="", help="SHA-256 hash of src file (SRI, base64url, or hex)")
    gen.add_argument("--type",   default="", dest="type_", help=f"Content type. One of: {', '.join(sorted(VALID_TYPES))}")
    gen.add_argument("--doc",    default="", help="Full Context Document URL")
    gen.add_argument("--hd",     default="", help="SHA-256 hash of doc file (SRI, base64url, or hex)")
    gen.add_argument("--author", default="")
    gen.add_argument("--org",    default="")
    gen.add_argument("--prev",   default="", help="Previous AIPM record URL")
    gen.add_argument("--lang",   default="", help="BCP 47 language tag")
    gen.add_argument("--base",   default=BASE_URL, help="Base URL (for self-hosting)")
    gen.add_argument("--compress", action="store_true", help="Force compression")
    gen.add_argument("--no-compress", action="store_true", help="Force no compression")

    hash_cmd = sub.add_parser("hash", help="Compute SHA-256 SRI hash of one or more files")
    hash_cmd.add_argument("files", nargs="+", help="Files to hash")

    parse_cmd = sub.add_parser("parse", help="Parse an AIPM URL into its fields")
    parse_cmd.add_argument("url", help="AIPM URL to parse")

    args = parser.parse_args(argv)

    if args.command is None:
        parser.print_help()
        return 0

    if args.command == "generate":
        # Neither flag → None (automatic). If both flags are given,
        # --no-compress takes precedence because it is applied last.
        force = None
        if args.compress:
            force = True
        if args.no_compress:
            force = False
        try:
            url = build_aipm_url(
                role=args.role, model=args.model, ctx=args.ctx,
                date=args.date, show=not args.no_show,
                src=args.src, hs=args.hs, type_=args.type_,
                doc=args.doc, hd=args.hd, author=args.author,
                org=args.org, prev=args.prev, lang=args.lang,
                base=args.base, force_compress=force,
            )
        except ValueError as exc:  # invalid --role / --type
            print(f"error: {exc}", file=sys.stderr)
            return 1
        print(url)
        return 0

    if args.command == "hash":
        status = 0
        for path in args.files:
            try:
                digest = sha256_file(path)
            except OSError as exc:  # missing, unreadable, or a directory
                print(f"error: cannot hash {path}: {exc}", file=sys.stderr)
                status = 1
                continue
            print(f"{digest}  {path}")
        return status

    if args.command == "parse":
        try:
            data = parse_aipm_url(args.url)
        except ValueError as exc:  # no fragment, or an undecodable z= payload
            print(f"error: {exc}", file=sys.stderr)
            return 1
        for k, v in sorted(data.items()):
            print(f"{k}: {v}")

    return 0


if __name__ == "__main__":
    raise SystemExit(_main())
