infocalypse: wormarc/linkmap.py

""" A class to keep track of history links stored in a set of files.

    Copyright (C) 2009 Darrell Karbott

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public
    License as published by the Free Software Foundation; either
    version 2.0 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    General Public License for more details.

    You should have received a copy of the GNU General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

    Author: djk@isFiaD04zgAgnrEC5XJt1i4IE7AkNPqhBG5bONi6Yks
"""

from binaryrep import read_link, str_sha

class LinkMap(dict):
    """ A history link hash addressable index of the history links in
        a set of block files. """
    def __init__(self):
        dict.__init__(self)
        self.files = []

    def read(self, file_list, keep_data=False):
        """ Read the index from a collection of block files. """
        counts = [0 for dummy in range(0, len(file_list))]
        age = 0 # Hmmmm
        for index, name in enumerate(file_list):
            in_stream = open(name, 'rb')
            raised = True
            try:
                latest_age, count = self.read_from_stream(in_stream,
                                                          index, keep_data)
                age = max(age, latest_age)
                counts[index] = count
                raised = False
            finally:
                if raised or keep_data:
                    in_stream.close()
                else:
                    self.files.append(in_stream)
        return age, tuple(counts)


    def read_from_stream(self, in_stream, index, keep_data=False):
        """ Read links from a stream. """
        age = 0
        count = 0
        while True:
            link = read_link(in_stream, keep_data, in_stream.tell(),
                             index)
            if link is None:
                break

            age = max(age, link[1])
            prev = list(self.get(link[0], []))
            link = list(link) # REDFLAG: ??? tuple -> list -> tuple
            prev.append(tuple(link))
            self[link[0]] = tuple(prev)
            count += 1

        return age, count

    # SLOW, get rid of list copy?
    # fixups is a old_index -> new index map
    # Omit from fixups == delete
    def _update_block_ordinals(self, fixups):
        """ INTERNAL: Implementation helper for update_blocks(). """
        for sha_hash in self.keys():
            prev = self.get(sha_hash)
            updated = []
            for link in prev:
                assert link[0] == sha_hash
                if not link[5] in fixups:
                    continue # Dropped block
                link = list(link)
                link[5] = fixups[link[5]]
                updated.append(tuple(link))
            if len(updated) > 0:
                self[sha_hash] = tuple(updated)
            else:
                del self[sha_hash]

    # Fixes ordinals in referenced links
    # Drops omited blocks
    # Closes and re-opens all file streams.
    # Loads links from the streams in new_indices.
    def update_blocks(self, fixups, file_list, new_indices, keep_data=False):
        """ Update the index to track addition, deletion and reordering of
        the underlying block files. """

        assert len(self.files) == 0 # must be closed.
        self._update_block_ordinals(fixups)
        self.files = []
        age = 0
        raised = True
        try:
            for index, name in enumerate(file_list):
                self.files.append(open(name, 'rb'))
                if not index in new_indices:
                    continue

                # Need to read links out of the new file.
                latest_age, dummy = self.read_from_stream(self.files[index],
                                                          index, keep_data)
                age = max(age, latest_age)
            raised = False
            return age
        finally:
            if raised:
                self.close()

    def close(self):
        """ Close the index. """
        for in_file in self.files:
            in_file.close()
        self.files = []

    def get_link(self, link_sha, need_data=False):
        """ Get a history link by its sha1 hash. """
        links = self.get(link_sha, None)
        if links is None:
            raise IOError("Unresolved link: " + str_sha(link_sha))

        assert len(links) > 0
        # REDFLAG: Fully think through.
        # The same link can exist in multiple files.
        link = links[0]

        if (not need_data) or (not link[3] is None):
            return link

        index = link[5]
        self.files[index].seek(link[4])
        ret = read_link(self.files[index], True)
        if ret is None:
            raise IOError("Couldn't read blob from disk.")

        assert ret[0] == link[0]
        assert ret[1] == link[1]
        assert ret[2] == link[2]
        assert not ret[3] is None
        assert ret[0] ==  link_sha
        return ret

def raw_block_read(link_map, ordinal):
    """ Read a single block file. """
    table = {}
    in_stream = link_map.files[ordinal]
    in_stream.seek(0)
    while True:
        start_pos = in_stream.tell()
        link = read_link(in_stream, False, start_pos, ordinal)
        # read_link() never returns None except for eof, right?
        # Otherwise we'd only do a partial read...
        if link is None:
            break
        entry = table.get(link[0], [])
        entry.append(link)
        table[link[0]] = entry
    return table

def links_by_block(link_map):
    """ INTERNAL: Implementation helper function for
        verify_link_map(). """
    tables = [{} for dummy in range(0, len(link_map.files))]
    for links in link_map.values():
        assert len(links) > 0
        for link in links:
            ordinal = link[5]
            assert ordinal >= 0 and ordinal < len(link_map.files)
            entry = tables[ordinal].get(link[0], [])
            entry.append(link)
            tables[ordinal][link[0]] = entry
    return tables

def verify_link_map(link_map):
    """ Debugging function to verify the integrity of a LinkMap instance. """

    assert len(link_map.files) > 0
    count = 0
    by_block = links_by_block(link_map)

    for ordinal in range(0, len(link_map.files)):
        raw_shas = raw_block_read(link_map, ordinal)
        # Hashes read from the raw file are the same as
        # the ones that the LinkMap thinks should be in the file.
        assert frozenset(raw_shas.keys()) == frozenset(by_block[ordinal].keys())

        # Now check values.
        for link_sha in raw_shas:
            assert (frozenset(raw_shas[link_sha]) ==
                    frozenset(by_block[ordinal][link_sha]))
            count += 1
    return count