#!/usr/bin/env python3
"""
zip_text_roundtrip.py

Convert a project ZIP file into a single labeled text file, and back again.

Usage examples
--------------

# ZIP -> text (for sending to an LLM)
python zip_text_roundtrip.py zip2txt project.zip project.txt --binary-mode skip

# ZIP -> text, but include all binary files as base64 so it can be fully reconstructed
python zip_text_roundtrip.py zip2txt project.zip project.txt --binary-mode base64

# Text (produced by this script) -> ZIP
python zip_text_roundtrip.py txt2zip project.txt project_restored.zip

Notes
-----
- The text format is line-oriented and meant to be reasonably readable.
- For full round-trip fidelity (including binaries), use --binary-mode base64.
  If you choose --binary-mode skip, the skipped binaries cannot be
  reconstructed later; txt2zip recreates them as empty placeholder files.
"""

from __future__ import annotations

import argparse
import base64
import textwrap
from pathlib import Path
import zipfile
from typing import Tuple, Optional

# Similar binary check heuristic to combine_folder.py
def is_probably_binary_bytes(sample: bytes) -> bool:
    """
    Guess whether a byte sample comes from a binary file.

    The sample is treated as binary when it contains a NUL byte or when it
    is not valid UTF-8. Note that the decode is strict, so a sample that was
    cut in the middle of a multi-byte UTF-8 character is reported as binary.

    Args:
        sample: Raw bytes to inspect (typically the beginning of a file).

    Returns:
        True if the sample looks binary, False if it decodes as UTF-8 text.
        An empty sample is considered text.
    """
    if b"\x00" in sample:
        return True
    try:
        sample.decode("utf-8")
    except UnicodeDecodeError:
        # Only a decoding failure means "binary"; any other exception would be
        # a programming error and should not be silently turned into True.
        return True
    return False


# Structural marker lines of the text format. Each one occupies a full line:
# FILE_START opens a file block, and the file's content lines sit between
# FILE_CONTENT_START and FILE_CONTENT_END.
FILE_START = "===== FILE START ====="
FILE_CONTENT_START = "===== FILE CONTENT START ====="
FILE_CONTENT_END = "===== FILE CONTENT END ====="

# We always write these five header lines after FILE_START:
# PATH: <path>
# TYPE: text|binary-base64|binary-as-text|binary-skipped
# SIZE: <int>
# ENCODING: <encoding name (for text / binary-as-text)>
# ENDS_WITH_NEWLINE: yes|no
# text_to_zip() expects the keys in exactly this order.
HEADER_KEYS = ("PATH", "TYPE", "SIZE", "ENCODING", "ENDS_WITH_NEWLINE")


def _utf8_safe_prefix(data: bytes, limit: int) -> bytes:
    """Return at most *limit* leading bytes of *data* without ending inside a UTF-8 character."""
    if len(data) <= limit:
        return data
    cut = limit
    # data[cut] is the first byte left out of the prefix. While it is a UTF-8
    # continuation byte (0b10xxxxxx) the prefix would end mid-character, so
    # back up to the start of that character (at most 3 bytes for UTF-8).
    while cut > 0 and cut > limit - 3 and (data[cut] & 0xC0) == 0x80:
        cut -= 1
    return data[:cut]


def _split_on_lf(text: str) -> list:
    """Split *text* into lines on LF only, without the terminator of the last line."""
    if not text:
        return []
    lines = text.split("\n")
    if lines[-1] == "":
        # The text ended with LF; that fact is carried by ENDS_WITH_NEWLINE,
        # so the empty tail produced by split() is not a real line.
        lines.pop()
    return lines


def zip_to_text(
    zip_path: Path,
    txt_path: Path,
    binary_mode: str = "skip",
    text_encoding: str = "utf-8",
) -> int:
    """
    Convert a ZIP file into a single text file.

    Every non-directory ZIP member becomes one block made of FILE_START, the
    five header lines, FILE_CONTENT_START, the content lines and
    FILE_CONTENT_END. Members are emitted sorted by lower-cased name.

    Args:
        zip_path: ZIP archive to read.
        txt_path: Text file to create; missing parent directories are created.
        binary_mode: What to do with members that look binary:
            "skip" writes a placeholder line, "base64" embeds the bytes as
            base64 wrapped at 76 columns, "text" decodes them with
            *text_encoding* using replacement characters.
        text_encoding: Encoding used to decode member bytes into text.

    Returns the number of files written.

    Raises:
        ValueError: If a binary member is found and *binary_mode* is not one
            of the three supported values.
    """
    sample_size = 2048  # number of leading bytes inspected by the binary heuristic
    b64_width = 76

    count = 0
    zip_path = zip_path.resolve()
    txt_path = txt_path.resolve()
    txt_path.parent.mkdir(parents=True, exist_ok=True)

    # newline="\n" keeps the output LF-terminated on every platform.
    with zipfile.ZipFile(zip_path, "r") as zf, txt_path.open(
        "w", encoding="utf-8", newline="\n"
    ) as out:
        out.write(f"# Generated from ZIP: {zip_path.name}\n")
        out.write("# Do not change the structural markers unless you know what you're doing.\n\n")

        # Sort entries to get deterministic order
        infos = sorted(
            (i for i in zf.infolist() if not i.is_dir()),
            key=lambda info: info.filename.lower(),
        )

        for info in infos:
            # Read by ZipInfo rather than by name so that archives holding
            # several members with the same name yield each member's own data.
            data = zf.read(info)
            size = len(data)
            # Classify on a prefix that never stops halfway through a
            # multi-byte character; a blind data[:2048] slice would make valid
            # UTF-8 text look undecodable and therefore binary.
            is_bin = is_probably_binary_bytes(_utf8_safe_prefix(data, sample_size))

            ends_with_newline = "no"
            content_lines = []

            if is_bin:
                if binary_mode == "skip":
                    ftype = "binary-skipped"
                    content_lines = ["[[BINARY CONTENT SKIPPED]]"]
                elif binary_mode == "base64":
                    ftype = "binary-base64"
                    b64 = base64.b64encode(data).decode("ascii")
                    # Fixed-width slicing gives the same 76-column lines as a
                    # text wrapper would, in a single linear pass.
                    content_lines = [
                        b64[i:i + b64_width] for i in range(0, len(b64), b64_width)
                    ]
                elif binary_mode == "text":
                    ftype = "binary-as-text"
                    text = data.decode(text_encoding, errors="replace")
                    ends_with_newline = "yes" if text.endswith("\n") else "no"
                    content_lines = _split_on_lf(text)
                else:
                    raise ValueError(f"Unknown binary_mode: {binary_mode}")
            else:
                ftype = "text"
                text = data.decode(text_encoding, errors="replace")
                ends_with_newline = "yes" if text.endswith("\n") else "no"
                # Split on LF only: str.splitlines() would also break on CR,
                # form feed, U+2028 and similar characters and drop them.
                content_lines = _split_on_lf(text)

            # Write header
            out.write(FILE_START + "\n")
            out.write(f"PATH: {info.filename}\n")
            out.write(f"TYPE: {ftype}\n")
            out.write(f"SIZE: {size}\n")
            out.write(f"ENCODING: {text_encoding}\n")
            out.write(f"ENDS_WITH_NEWLINE: {ends_with_newline}\n")
            out.write(FILE_CONTENT_START + "\n")

            # Write content lines exactly as lines, then closing marker
            out.writelines(line + "\n" for line in content_lines)
            out.write(FILE_CONTENT_END + "\n\n")
            count += 1

    return count


def _parse_header_line(line: str) -> Tuple[str, str]:
    """
    Parse a header line of the form "KEY: value" and return (key, value).
    """
    if ":" not in line:
        raise ValueError(f"Invalid header line (no colon): {line!r}")
    key, value = line.split(":", 1)
    return key.strip(), value.strip()


def text_to_zip(
    txt_path: Path,
    zip_path: Path,
) -> int:
    """
    Convert a text file produced by zip_to_text() back into a ZIP.

    Lines outside file blocks are ignored. For every block the five headers
    are read in HEADER_KEYS order, the content is collected up to
    FILE_CONTENT_END, and one ZIP member is written:

    - "binary-base64": content is base64-decoded.
    - "text" / "binary-as-text": content is encoded with the ENCODING header
      (falling back to utf-8 when empty); a final LF is added when
      ENDS_WITH_NEWLINE is "yes".
    - "binary-skipped": an empty placeholder member is written.

    Newline handling: the file is read without newline translation. When a
    block's FILE_START line ends with a bare LF (what zip_to_text() writes),
    content is taken verbatim, so CR characters inside it survive. When the
    container was re-saved with CRLF or CR line endings, content line endings
    are normalized to LF, which is what universal-newline reading would give.

    Args:
        txt_path: Text file to read.
        zip_path: ZIP file to create; missing parent directories are created.

    Returns the number of files written.

    Raises:
        ValueError: On a malformed block (EOF inside a block, unexpected
            header key, missing marker, or unknown TYPE).
    """
    txt_path = txt_path.resolve()
    zip_path = zip_path.resolve()
    zip_path.parent.mkdir(parents=True, exist_ok=True)

    count = 0
    # newline="" disables newline translation; lines still end at LF, CR or
    # CRLF but keep their original terminator.
    with txt_path.open("r", encoding="utf-8", newline="") as inp, zipfile.ZipFile(
        zip_path, "w", compression=zipfile.ZIP_DEFLATED
    ) as zf:
        while True:
            start_line = inp.readline()
            if not start_line:
                break  # EOF

            # Skip anything until we hit a FILE_START marker
            if start_line.rstrip("\r\n") != FILE_START:
                continue

            # A bare-LF marker means the container still has the line endings
            # zip_to_text() wrote, so any CR in the content is real data.
            verbatim = start_line == FILE_START + "\n"

            # We are at the start of a file block; read header lines
            headers = {}
            for expected_key in HEADER_KEYS:
                header_line = inp.readline()
                if not header_line:
                    raise ValueError("Unexpected EOF while reading header")
                key, value = _parse_header_line(header_line.rstrip("\r\n"))
                if key != expected_key:
                    raise ValueError(
                        f"Unexpected header key. Expected {expected_key!r}, got {key!r}"
                    )
                headers[key] = value

            # Next line must be FILE_CONTENT_START
            marker_line = inp.readline()
            if not marker_line:
                raise ValueError("Unexpected EOF before FILE_CONTENT_START")
            if marker_line.rstrip("\r\n") != FILE_CONTENT_START:
                raise ValueError(
                    f"Expected {FILE_CONTENT_START!r}, got {marker_line!r}"
                )

            # Collect raw content lines (terminators included) until FILE_CONTENT_END
            raw_lines = []
            for raw in inp:
                if raw.rstrip("\r\n") == FILE_CONTENT_END:
                    break
                raw_lines.append(raw)
            else:
                # Loop finished without hitting END marker
                raise ValueError(f"Missing {FILE_CONTENT_END} for file block")

            rel_path = headers["PATH"]
            ftype = headers["TYPE"]
            encoding = headers["ENCODING"] or "utf-8"
            ends_flag = headers.get("ENDS_WITH_NEWLINE", "no").lower() == "yes"

            if ftype == "binary-base64":
                # Join, strip whitespace, decode base64
                b64_str = "".join(piece.strip() for piece in raw_lines)
                data = base64.b64decode(b64_str.encode("ascii"))
            elif ftype in ("text", "binary-as-text"):
                text = "".join(raw_lines)
                if not verbatim:
                    # Container line endings were changed by an editor; fold
                    # them back to LF.
                    text = text.replace("\r\n", "\n").replace("\r", "\n")
                # The LF after the last content line belongs to the container
                # format; the file's own final newline is restored from the
                # ENDS_WITH_NEWLINE header instead.
                if text.endswith("\n"):
                    text = text[:-1]
                if ends_flag:
                    text = text + "\n"
                data = text.encode(encoding, errors="replace")
            elif ftype == "binary-skipped":
                # We don't have the original binary; create an empty placeholder file.
                data = b""
            else:
                raise ValueError(f"Unknown TYPE in header: {ftype!r}")

            # Write into the new zip
            zf.writestr(rel_path, data)
            count += 1

    return count


def parse_args(argv: Optional[list] = None) -> argparse.Namespace:
    """
    Build the command-line parser and parse the arguments.

    Two subcommands are defined: ``zip2txt`` (positional ``zip`` and ``text``
    paths plus ``--binary-mode`` and ``--encoding``) and ``txt2zip``
    (positional ``text`` and ``zip`` paths). The chosen subcommand is stored
    in the ``command`` attribute.

    Args:
        argv: Argument strings to parse. When None (the default), argparse
            reads them from ``sys.argv``.

    Returns:
        The populated argparse namespace.
    """
    p = argparse.ArgumentParser(
        description=(
            "Convert a project ZIP file into a single labeled text file, and back again.\n\n"
            "Subcommands:\n"
            "  zip2txt: ZIP -> text\n"
            "  txt2zip: text -> ZIP\n"
        ),
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    sub = p.add_subparsers(dest="command", required=True)

    p_zip2txt = sub.add_parser(
        "zip2txt",
        help="Convert a ZIP into a single labeled text file.",
        # The --binary-mode help below is laid out over several lines; the
        # default formatter would re-flow it into one paragraph.
        formatter_class=argparse.RawTextHelpFormatter,
    )
    p_zip2txt.add_argument("zip", type=Path, help="Input ZIP file path")
    p_zip2txt.add_argument("text", type=Path, help="Output text file path")
    p_zip2txt.add_argument(
        "--binary-mode",
        choices=["skip", "base64", "text"],
        default="skip",
        help=(
            "How to include binary files in the text:\n"
            "  skip   - just note that the file was skipped (default, smaller text).\n"
            "  base64 - embed as base64; use this if you want full round-trip.\n"
            "  text   - best-effort text decode (may mangle bytes).\n"
        ),
    )
    p_zip2txt.add_argument(
        "--encoding",
        default="utf-8",
        help="Text encoding for decoding source files. Default: utf-8",
    )

    p_txt2zip = sub.add_parser(
        "txt2zip", help="Reconstruct a ZIP from a text file produced by zip2txt."
    )
    p_txt2zip.add_argument("text", type=Path, help="Input text file")
    p_txt2zip.add_argument("zip", type=Path, help="Output ZIP file")

    return p.parse_args(argv)


def main() -> None:
    """
    Command-line entry point.

    Parses the arguments, runs the selected conversion and prints a one-line
    summary with the number of files written. Exits with "Unknown command"
    if the parsed subcommand is neither zip2txt nor txt2zip.
    """
    args = parse_args()
    command = args.command

    if command not in ("zip2txt", "txt2zip"):
        raise SystemExit("Unknown command")

    if command == "zip2txt":
        written = zip_to_text(
            zip_path=args.zip,
            txt_path=args.text,
            binary_mode=args.binary_mode,
            text_encoding=args.encoding,
        )
        print(f"Wrote {written} file(s) from {args.zip} into {args.text}")
        return

    # Only txt2zip is left at this point.
    written = text_to_zip(txt_path=args.text, zip_path=args.zip)
    print(f"Wrote {written} file(s) into {args.zip}")

# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()


# to convert zip files containing an entire project code repository to a single text file:
# python zip_text_roundtrip.py zip2txt my_flask_app.zip my_flask_app.txt --binary-mode skip

# OR to include binary files:
# python zip_text_roundtrip.py zip2txt my_flask_app.zip my_flask_app.txt --binary-mode base64

# to convert text files back to zip:
# python zip_text_roundtrip.py txt2zip my_flask_app_edited.txt my_flask_app_edited.zip