""" Functions to read and write binary representation of archive data.

    Copyright (C) 2009 Darrell Karbott

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public
    License as published by the Free Software Foundation; either
    version 2.0 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    General Public License for more details.

    You should have received a copy of the GNU General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

    Author: djk@isFiaD04zgAgnrEC5XJt1i4IE7AkNPqhBG5bONi6Yks
"""

# REDFLAG: Only tested on x86 32-bit Intel Linux. Alignment/endianness issues?
# REDFLAG: OK to read/write byte strings directly w/o (un)pack'ing, right?
# REDFLAG: REDUCE RAM: do chunked read/writes/hash digests where possible.

import struct
from binascii import hexlify
from hashlib import sha1

# All-zero value with the length of a raw (binary) SHA1 digest.
# Built with struct ('x' packs as a zero pad byte) so that it is a byte
# string on both Python 2 and Python 3 and can compare equal to a digest.
NULL_SHA = struct.pack('20x')

# Link header: total length (32 bit), age (32 bit), parent SHA1 (20 bytes).
# '!' selects network byte order with standard sizes and no padding.
LINK_HEADER_FMT = '!LL20s'
LINK_HEADER_LEN = struct.calcsize(LINK_HEADER_FMT)

# Number of entries in a file manifest (32 bit).
COUNT_FMT = "!L"
COUNT_LEN = struct.calcsize(COUNT_FMT)

# Manifest entry header: entry length (16 bit), file SHA1, history SHA1.
# REDFLAG: doc name length limit. The entry length is a 16 bit field which
#          covers the header plus the name, so a name can be at most
#          65535 - MANIFEST_ENTRY_HDR_LEN bytes long.
MANIFEST_ENTRY_HDR_FMT = "!H20s20s"
MANIFEST_ENTRY_HDR_LEN = struct.calcsize(MANIFEST_ENTRY_HDR_FMT)
# Full entry format; apply '%' with the name length to get a struct format.
MANIFEST_ENTRY_FMT = MANIFEST_ENTRY_HDR_FMT + "%is"

MSG_INCOMPLETE_READ = "Bad stream, EOF during read."

# Chunk size used when hashing files incrementally.
READ_CHUNK_LEN = 1024 * 16

def str_sha(raw_sha):
    """ Return a 12 digit hex string for a raw SHA1 hash.

        raw_sha is the binary digest; only the hex digits for its first
        6 bytes are returned.

        The result is a native string on both Python 2 and Python 3.
    """
    digits = hexlify(raw_sha)[:12]
    if not isinstance(digits, str):
        # hexlify() returns bytes on Python 3; the docstring promises a
        # string, so convert. The digits are always plain ASCII.
        digits = digits.decode('ascii')
    return digits

# Used to catch pilot error which otherwise shows up as weird failures.
def check_shas(raw_sha_sequence):
    """ INTERNAL: Raise a ValueError if the sequence values don't look like
        raw SHA1 hashes.

        raw_sha_sequence -- iterable of values to check.

        Only the length of each value is checked (it must be 20), not its
        type or contents. Returns None when every value passes.

        Raises ValueError if the sequence is None, or if any value in it
        is None, has no length, or has a length other than 20.
    """
    if raw_sha_sequence is None:
        raise ValueError("SHA1 hash sequence is None?")
    for value in raw_sha_sequence:
        if value is None:
            raise ValueError("None instead of binary SHA1 digest")

        try:
            looks_raw = len(value) == 20
        except TypeError:
            # Values without a length (e.g. the ints produced by iterating
            # over a single digest on Python 3) are pilot error too, so
            # report them the same way instead of leaking a TypeError.
            looks_raw = False

        if not looks_raw:
            raise ValueError("Doesn't look like a binary SHA1 digest: %s" %
                             repr(value))

def checked_read(in_stream, length, allow_eof=False):
    """ Read a fixed number of bytes from an open input stream.

        in_stream -- open stream with a read(length) method.
        length    -- exact number of bytes wanted.
        allow_eof -- if True, a read which returns no data at all is
                     treated as a clean EOF and the empty result is
                     returned instead of raising.

        Returns the data read.

        Raises IOError if EOF is encountered before all bytes are read.
    """

    data = in_stream.read(length)
    # Test for emptiness rather than comparing against '' so that the
    # clean EOF case is also recognized for Python 3 byte strings.
    if allow_eof and not data:
        return data
    if len(data) != length:
        raise IOError(MSG_INCOMPLETE_READ)
    return data

# Wire rep:
# <total length><age><parent><blob data>
#
# Python rep
#  0     1    2       3     4              5             6
# (sha1, age, parent, data, stream_offset, stream_index, physical_len)
#
# sha1 is hash of str(age) + parent + data
# physical_len is the number of bytes of storage used to persist
# the link.
def read_link(in_stream, keep_data=True, pos=None, stream_index=None):
    """ Read a single history link from an open stream.

        in_stream    -- open binary stream positioned at the start of a
                        link.
        keep_data    -- if False, the data slot of the returned tuple is
                        None. The data is still read and hashed.
        pos          -- stored unchanged in the stream_offset slot.
        stream_index -- stored unchanged in the stream_index slot.

        Returns a history link tuple (see the comment above), or None on
        a clean EOF, i.e. when no bytes are left where the header should
        start.

        Raises IOError if the stream ends in the middle of a link or if
        the length field is smaller than the link header.
    """

    header = checked_read(in_stream, LINK_HEADER_LEN, True)
    # Emptiness test instead of == '' so this also works for Python 3
    # byte strings.
    if len(header) == 0:
        return None # Clean EOF

    length, age, parent = struct.unpack(LINK_HEADER_FMT, header)
    if length < LINK_HEADER_LEN:
        # A negative payload length would make read() slurp the rest of
        # the stream before the error is noticed. Fail fast instead.
        raise IOError("Bad stream, link length is smaller than the header.")
    payload_len = length - LINK_HEADER_LEN # already read header
    raw = checked_read(in_stream, payload_len)

    # REDFLAG: incrementally read / hash
    # The age is hashed as its decimal text. Encode it explicitly because
    # sha1() only accepts byte strings on Python 3; on Python 2 this is a
    # no-op for the ASCII digits.
    sha_value = sha1(str(age).encode('ascii'))
    sha_value.update(parent)
    sha_value.update(raw)

    if not keep_data:
        raw = None

    # NOTE(review): the last slot is the payload length only, while
    # write_raw_link() reports len(raw) + LINK_HEADER_LEN there. Confirm
    # which one callers expect before changing either.
    return (sha_value.digest(), age, parent, raw,
            pos, stream_index, payload_len)


def copy_raw_links(in_stream, out_stream, allowed_shas, copied_shas):
    """ Copy any links with SHA1 hashes in allowed_shas from in_stream to
        out_stream.

        in_stream    -- open binary stream positioned at the start of a
                        link; it is read until EOF.
        out_stream   -- open binary stream the raw links are written to.
        allowed_shas -- container of raw SHA1 digests of the links to
                        copy, or None to copy every link.
        copied_shas  -- set of raw SHA1 digests which were already copied.
                        Links whose digest is in it are skipped, and the
                        digest of every link copied here is added to it.

        Returns the number of links copied.

        Raises IOError if the stream ends in the middle of a link or if
        a length field is smaller than the link header.
    """
    count = 0
    while True:
        hdr = checked_read(in_stream, LINK_HEADER_LEN, True)
        # Emptiness test instead of == '' so this also works for Python 3
        # byte strings.
        if len(hdr) == 0:
            return count # Clean EOF
        length, age, parent = struct.unpack(LINK_HEADER_FMT, hdr)
        if length < LINK_HEADER_LEN:
            # A negative payload length would make read() slurp the rest
            # of the stream before the error is noticed. Fail fast.
            raise IOError("Bad stream, link length is smaller than the "
                          "header.")
        rest = checked_read(in_stream, length - LINK_HEADER_LEN)

        # Same digest as read_link(): decimal text of age, parent, data.
        # The explicit encode keeps sha1() happy on Python 3.
        sha_value = sha1(str(age).encode('ascii'))
        sha_value.update(parent)
        sha_value.update(rest)
        value = sha_value.digest()
        if value in copied_shas:
            continue # Only copy once.

        if allowed_shas is None or value in allowed_shas:
            # Header and payload are copied verbatim.
            out_stream.write(hdr)
            out_stream.write(rest)
            count += 1
            copied_shas.add(value)

# Sets pos, but caller must fix stream index
# NOTE(review): stream_index is only passed through into the returned
# tuple; presumably that is what the caller has to fix up. Confirm.
def write_raw_link(out_stream, age, parent, raw_file, stream_index):
    """ Write a history link to an open stream.

        out_stream   -- open binary stream supporting tell() and write().
        age          -- integer age, stored as a 32 bit field.
        parent       -- raw (binary, not hex) 20 byte SHA1 of the parent.
        raw_file     -- path of the file holding the link's blob data.
        stream_index -- stored unchanged in the returned tuple.

        Returns a history link tuple for the link written. Its data slot
        is None, its offset is the position of out_stream before the
        write and its physical length includes the link header.
    """

    assert len(parent) == 20 # Raw, not hex string

    pos = out_stream.tell()
    in_file = open(raw_file, 'rb')
    try:
        # REDFLAG: read / hash incrementally
        raw = in_file.read()
    finally:
        # The input is no longer needed once it is in memory.
        in_file.close()

    physical_len = len(raw) + LINK_HEADER_LEN
    out_stream.write(struct.pack(LINK_HEADER_FMT,
                                 physical_len,
                                 age,
                                 parent))
    out_stream.write(raw)

    # The age is hashed as its decimal text. Encode it explicitly because
    # sha1() only accepts byte strings on Python 3; on Python 2 this is a
    # no-op for the ASCII digits.
    sha_value = sha1(str(age).encode('ascii'))
    sha_value.update(parent)
    sha_value.update(raw)

    return (sha_value.digest(), age, parent, None,
            pos, stream_index, physical_len)

def write_file_manifest(name_map, out_stream):
    """ Write file manifest data to an open stream.

        name_map   -- maps each file name to a (file_sha, history_sha)
                      tuple. Both values are packed as 20 byte fields.
        out_stream -- open binary output stream.

        The output is the entry count followed by one entry per name,
        in sorted name order. Each entry holds its own total length,
        the two SHA1 values and the name.

        NOTE(review): on Python 3 struct requires the names to be byte
        strings; confirm what callers pass.
    """

    out_stream.write(struct.pack(COUNT_FMT, len(name_map)))
    # Sort to make it easier for diff algos to find contiguous
    # changes.
    # sorted() is used because keys() has no sort() method on Python 3.
    for name in sorted(name_map):
        # The length field covers the entry header plus the name.
        length = MANIFEST_ENTRY_HDR_LEN + len(name)
        file_sha, history_sha = name_map[name]

        out_stream.write(struct.pack(MANIFEST_ENTRY_FMT % len(name),
                                     length,
                                     file_sha,
                                     history_sha,
                                     name))
def read_file_manifest(in_stream):
    """ Read file manifest data from an open input stream.

        in_stream -- open binary stream positioned at the entry count.

        Returns a dictionary mapping each name to its
        (file_sha, history_sha) tuple.

        Raises IOError if the stream ends early, if an entry length is
        smaller than the entry header, or if a name occurs twice.
    """
    count = struct.unpack(COUNT_FMT, checked_read(in_stream, COUNT_LEN))[0]
    name_map = {}
    # Count down instead of iterating over range(count), so that a
    # corrupt count can't force a huge list to be built on Python 2.
    remaining = count
    while remaining > 0:
        remaining -= 1
        length, file_sha, history_sha = \
                struct.unpack(MANIFEST_ENTRY_HDR_FMT,
                              checked_read(in_stream,
                                           MANIFEST_ENTRY_HDR_LEN))

        if length < MANIFEST_ENTRY_HDR_LEN:
            # A negative name length would make read() slurp the rest of
            # the stream before the error is noticed. Fail fast instead.
            raise IOError("Bad stream, manifest entry length is smaller "
                          "than the header.")

        # The length field covers the header too; the rest is the name.
        name = checked_read(in_stream, length - MANIFEST_ENTRY_HDR_LEN)

        # Raised explicitly rather than asserted, so that the check on
        # stream contents survives running with -O.
        if name in name_map:
            raise IOError("Bad stream, duplicate manifest entry: %s" %
                          repr(name))
        name_map[name] = (file_sha, history_sha)
    return name_map

def manifest_to_file(file_name, name_map):
    """ Create (or overwrite) file_name and write the manifest in
        name_map to it.

        The file is opened in binary mode and is always closed again,
        even if writing the manifest fails.
    """
    sink = open(file_name, 'wb')
    try:
        write_file_manifest(name_map, sink)
    finally:
        # Release the handle whether or not the write succeeded.
        sink.close()

def manifest_from_file(file_name):
    """ Open file_name in binary mode and return the manifest read
        from its start.

        The file is always closed again, even if reading fails.
    """
    source = open(file_name, 'rb')
    try:
        name_map = read_file_manifest(source)
    finally:
        # Release the handle whether or not the read succeeded.
        source.close()
    return name_map

def get_file_sha(full_path):
    """ Return the 20 byte sha1 hash digest of a file.

        full_path -- path of the file to hash.

        The file is read in READ_CHUNK_LEN sized pieces, so it never
        has to fit into memory as a whole.
    """
    in_file = open(full_path, 'rb')
    try:
        # NOTE(review): an older comment here asked why this doesn't use
        # sha_func. No such name is visible in this file; confirm whether
        # that is still relevant.
        sha_value = sha1()
        while True:
            chunk = in_file.read(READ_CHUNK_LEN)
            # Emptiness test instead of == "": on Python 3 the b'' read
            # at EOF never equals "", which made this loop endless.
            if not chunk:
                break
            sha_value.update(chunk)
        return sha_value.digest()
    finally:
        in_file.close()