A module to keep a file archive in a bounded number of Write Once Read Many blocks.
diff --git a/wormarc/archive.py b/wormarc/archive.py
new file mode 100644
--- /dev/null
+++ b/wormarc/archive.py
@@ -0,0 +1,404 @@
+""" Classes to maintain an updateable file archive on top of
+    a bounded number of WORM (Write Once Read Many) blocks.
+
+ Copyright (C) 2009 Darrell Karbott
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ Author: djk@isFiaD04zgAgnrEC5XJt1i4IE7AkNPqhBG5bONi6Yks
+"""
+
+import os
+from binaryrep import NULL_SHA, write_raw_link, check_shas #, str_sha
+from blocknames import BLOCK_SUFFIX, ReadWriteNames
+
+# Just happens to be Freenet block size ;-)
+MIN_BLOCK_LEN = 32 * 1024
+
+MAX_HISTORY_LEN = 16
+
+# A COALESCE_FACTOR of 1 effectively causes a full reinsert when
+# history chains are shortened.
+# Larger values favor smaller incremental deltas at the expense of
+# a longer history chain and larger total history size.
+COALESCE_FACTOR = 1.5
+
+#----------------------------------------------------------#
+
+def is_ordered(partitions):
+ """ Return True if the partitions are in ascending order,
+ False otherwise. """
+
+ # Ignore trailing 0 length blocks.
+ lengths = [value[2] for value in partitions]
+ while len(lengths) > 0 and lengths[-1] == 0:
+ lengths = lengths[:-1]
+
+    for index in range(0, len(lengths) - 1):
+ #if lengths[index] >= lengths[index + 1]:
+ if lengths[index] > lengths[index + 1]:
+ return False
+ return True
+
+def is_contiguous(partitions):
+ """ Return True if the block numbers in adjacent
+    partitions are contiguous, False otherwise. """
+ if len(partitions) == 0:
+ return True
+
+ if partitions[-1][0] > partitions[-1][1]:
+ return False # Hmmmm...
+
+    for index in range(0, len(partitions) - 1):
+ if partitions[index][0] > partitions[index][1]:
+ return False # Hmmmm...
+ span = partitions[index + 1][0] - partitions[index][1]
+ if span < 0 or span > 1:
+ return False
+
+ return True
+
+# [(start_block, end_block, length), ...]
+def repartition(partitions, multiple=2):
+ """ Merge newest to oldest until
+ len(partition[n-1]) <= multiple * len(partition[n])
+ for all partitions. """
+
+ for index in range (0, len(partitions) - 1):
+ if partitions[index][2] * multiple >= partitions[index + 1][2]:
+ good = partitions[0:index]
+ rest = partitions[index:]
+ # Hmmm... if this is True, maybe you should simplify your rep.???
+ assert rest[1][0] - rest[0][1] >= 0 and rest[1][0] - rest[0][1] < 2
+ rest[1] = (rest[0][0], rest[1][1], rest[0][2] + rest[1][2])
+ rest = rest[1:]
+            ret = good + repartition(rest, multiple)
+ assert is_ordered(ret)
+            # Removed this constraint so I can drop empty partitions
+ # assert is_contiguous(ret)
+ return ret
+
+ ret = partitions[:] # Hmmmm
+ assert is_ordered(ret)
+ assert is_contiguous(ret)
+ return ret
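+
+# Illustrative example (not from the original code): with multiple=2,
+# repartition([(0, 0, 2), (1, 1, 3), (2, 2, 10)]) merges the first two
+# partitions (2 * 2 >= 3), merges again (5 * 2 >= 10), and returns
+# [(0, 2, 15)]. [(0, 0, 2), (1, 1, 5), (2, 2, 11)] is already stable
+# and is returned unchanged.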
+
+def compress(partitions, max_len, multiple=2):
+ """ Reduce the length of the partitions to <= max_len.
+
+ Drops zero length partitions. """
+
+ partitions = partitions[:]
+ partitions = [partition for partition in partitions
+ if partition[2] > 0]
+
+ if len(partitions) <= max_len:
+ return partitions
+
+ assert max_len > 1
+ while len(partitions) > max_len:
+ combined = (partitions[0][0], partitions[1][1],
+ partitions[0][2] + partitions[1][2])
+ partitions[1] = combined
+ # Enforce the ordering constraint.
+ partitions = repartition(partitions[1:], multiple)
+
+ assert is_ordered(partitions)
+ return partitions
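+
+# Illustrative example (not from the original code):
+# compress([(0, 0, 5), (1, 1, 6), (2, 2, 7), (3, 3, 100)], 2) first
+# merges blocks 0 and 1 into (0, 1, 11); repartitioning folds in block 2
+# to give (0, 2, 18), and the result is [(0, 2, 18), (3, 3, 100)].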
+
+#----------------------------------------------------------#
+
+class WORMBlockArchive:
+ """ A file archive implemented on top of a bounded length sequence
+ of Write Once Read Many blocks.
+
+ Updating the archive means replacing one or more of the
+ underlying blocks.
+
+ The fundamental atom of storage is a 'history' link. A
+ history link contains an age, the sha1 hash of its parent
+ link, and a blob of delta encoded change data. Age is an
+ integer which is incremented with every update to the
+ archive. History links have at most one parent, but may have
+ many children.
+
+ The archive has an index which maps history link sha1 hashes
+ to history links.
+
+ Files are represented as chains of history links. They are
+ retrieved from the archive by running the delta decoding
+ algorithm over all the patch blobs in the chain. Files are
+ addressable by the sha1 hash of the head link in the history
+ chain. The FileManifest class allows files in the archive to
+ be accessed by human readable names.
+
+ The start_update() method creates a temporary block for update
+ writes. write_new_delta() writes a new history link into the
+ temporary block. commit_update() permanently adds the updates
+ in the temporary block to the archive, re-writing blocks as
+ necessary in order to bound the total number of blocks in the
+ archive at max_blocks.
+
+    There is no explicit notion of deleting history links or files,
+    but unreferenced history links may be dropped whenever new
+    blocks are created.
+
+ The design for this module was influenced by looking at
+ revlog.py in Mercurial, and to a lesser extent by reading
+ about how git works.
+
+ It was written to implement incrementally updateable file
+ archives on top of Freenet.
+
+ """
+ def __init__(self, delta_coder, blocks):
+ self.delta_coder = delta_coder
+ self.delta_coder.get_data_func = self.get_data
+ self.blocks = blocks
+ self.max_blocks = 4
+ # Hmmm...
+ self.age = 0
+
+    def create(self, block_dir, base_name, overwrite=False):
+ """ Create a new archive. """
+ names = ReadWriteNames(block_dir, base_name, BLOCK_SUFFIX)
+ self.age = self.blocks.create(names, self.max_blocks, overwrite)
+
+ # Updateable.
+ # LATER: read only???
+ def load(self, block_dir, base_name, tags=None):
+ """ Load an existing archive. """
+ names = ReadWriteNames(block_dir, base_name, BLOCK_SUFFIX)
+ self.age = self.blocks.load(names, self.max_blocks, tags)
+
+ # MUST call this if you called load() or create()
+ def close(self):
+ """ Close the archive. """
+ self.blocks.close()
+
+ # Callback used by DeltaCoder.
+ def get_data(self, link_sha, return_stream=False):
+ """ INTERNAL: Helper function used by DeltaCoder to get raw
+ change data. """
+ assert not return_stream
+ return self.blocks.link_map.get_link(link_sha, True)[3]
+
+ # by head history link sha, NOT file sha
+ def get_file(self, history_sha, out_file):
+ """ Get a file by the sha1 hash of the head link in its
+ history link chain. """
+ check_shas([history_sha, ]) # hmmmm...
+ if history_sha == NULL_SHA:
+ tmp = open(out_file, 'wb')
+ tmp.close()
+ return
+
+ self.delta_coder.apply_deltas(self.blocks.get_history(history_sha),
+ out_file)
+
+ # Hmmmm... too pedantic. how much faster would this run
+ # if it were in BlockStorage?
+ # DESIGN INTENT: BlockStorage shouldn't need to know
+ # about DeltaCoder.
+ def write_new_delta(self, history_sha, new_file):
+ """ Writes a new history link to the update file.
+
+ history_sha can be NULL_SHA.
+
+        May ignore history, i.e. not link to previous history.
+
+ Returns the new link.
+
+ REQUIRES: is updating.
+ REQUIRES: history_sha is present in the currently committed
+ version of the archive.
+        You CANNOT reference uncommitted history links.
+ """
+ check_shas([history_sha, ])
+
+ self.require_blocks()
+ if self.blocks.update_file is None:
+ raise Exception("Not updating.")
+
+ history = self.blocks.get_history(history_sha)
+ tmp_file = self.blocks.tmps.make_temp_file()
+ old_file = self.blocks.tmps.make_temp_file()
+ oldest_delta = self.blocks.tmps.make_temp_file()
+ blob_file = None
+ try:
+ # REDFLAG: Think through.
+ # It would make more sense for the delta coder to decide when to
+ # truncate history, but I don't want to expose the full archive
+ # interface to the delta coder implementation.
+ if len(history) >= MAX_HISTORY_LEN:
+ # Delta to original file.
+ self.get_file(history[-1][0], old_file)
+ parent0 = self.delta_coder.make_delta(history[-1:],
+ old_file,
+ new_file,
+ oldest_delta)
+ # Full reinsert
+ parent1 = self.delta_coder.make_full_insert(new_file,
+ tmp_file)
+
+ #print "full: %i old: %i delta: %i target: %i" % (
+ # os.path.getsize(tmp_file),
+ # history[-1][6],
+ # os.path.getsize(oldest_delta),
+ # COALESCE_FACTOR * os.path.getsize(tmp_file))
+
+ # LATER: Back to this.
+ # This is bottom up history shortening driven by the most
+ # recent changes. We should also have some mechanism shortening
+ # history (to 1 link) for files which haven't changed in many
+ # updates, whenever blocks are merged.
+ # Hmmmm... hard (impossible?) to decouple from manifest because
+ # files are addressed by head history link sha
+ if (COALESCE_FACTOR * os.path.getsize(tmp_file) <
+ (os.path.getsize(oldest_delta) + history[-1][6])):
+ parent = parent1
+ blob_file = tmp_file
+ #print "SHORTENED: FULL REINSERT"
+ else:
+ #print "history:"
+ #for link in history:
+ # print " ", str_sha(link[0]), str_sha(link[2])
+
+ parent = parent0
+ #print
+ #print "parent: ", str_sha(parent)
+
+ blob_file = oldest_delta
+ #print "SHORTENED: COMPRESSED DELTAS"
+ else:
+ self.get_file(history_sha, old_file)
+ parent = self.delta_coder.make_delta(history, old_file,
+ new_file,
+ tmp_file)
+ blob_file = tmp_file
+
+
+ self.blocks.update_links.append(
+ write_raw_link(self.blocks.update_stream,
+ self.age + 1, parent,
+ blob_file, 0))
+ return self.blocks.update_links[-1]
+ finally:
+ self.blocks.tmps.remove_temp_file(old_file)
+ self.blocks.tmps.remove_temp_file(oldest_delta)
+ self.blocks.tmps.remove_temp_file(tmp_file)
+
+ def require_blocks(self):
+ """ INTERNAL: Raises if the BlockStorage delegate isn't initialized."""
+ if self.blocks is None:
+ raise Exception("Uninitialized. Run create() or load().")
+
+ def start_update(self):
+ """ Create temporary storage required to update the archive. """
+ self.require_blocks()
+ self.blocks.start_update()
+
+ def abandon_update(self):
+ """ Abandon all changes made to the archive since
+ start_update() and free temporary storage. """
+ if not self.blocks is None: # Hmmmm...
+ self.blocks.abandon_update()
+
+    # Allowed to drop history links not in the referenced_shas set.
+ def commit_update(self, referenced_shas=None):
+ """ Permanently write changes into the archive. """
+
+ self.require_blocks()
+ if referenced_shas is None:
+ referenced_shas = set([])
+ self.age = self.blocks.commit_update(referenced_shas)
+ self.compress(referenced_shas)
+
+
+ # Restores length and ordering invariants.
+ def compress(self, referenced_shas):
+ """ Compresses the archive to fit in max_blocks blocks.
+
+ REQUIRES: self.blocks.total_blocks() > max_blocks
+
+ Merges blocks such that:
+ n <= max_blocks
+ and
+ block[0] < block[1] ... < block[n -1]
+ """
+
+ if referenced_shas is None:
+ referenced_shas = set([])
+
+ check_shas(referenced_shas)
+
+ #count = self.blocks.nonzero_blocks()
+
+ # Compute the "real" size of each block without unreferenced links
+ real_lens = [0 for dummy in range(0, len(self.blocks.tags))]
+
+ for links in self.blocks.link_map.values():
+ for link in links:
+ if not link[0] in referenced_shas:
+ continue
+ real_lens[link[5]] += link[6]
+
+ uncompressed = [[index, index, real_lens[index]]
+ for index in range(0, len(self.blocks.tags))]
+
+ compressed = compress(uncompressed, self.max_blocks)
+ # Can't put lists in a set.
+ compressed = [tuple(value) for value in compressed]
+ uncompressed = [tuple(value) for value in uncompressed]
+
+ if compressed == uncompressed:
+ return False
+
+ self.blocks.update_blocks(uncompressed, compressed,
+ referenced_shas, self.max_blocks)
+ return True
+
+ def referenced_shas(self, head_sha_list, include_updates=True):
+ """ Return the SHA1 hashes of all history links referenced by the
+ links in the head_sha_list. """
+
+ check_shas(head_sha_list)
+
+ ret = set([])
+ for head_sha in head_sha_list:
+ for link in self.blocks.get_history(head_sha):
+ ret.add(link[0])
+ if include_updates:
+ ret = ret.union(self.uncommited_shas())
+
+ # Hmmm... frozenset faster?
+ return ret
+
+ def uncommited_shas(self):
+ """ Return a set of SHA1 hash digests for history links that have
+ been added since start_update().
+
+        Note that get_file() fails for these SHA1s because they aren't
+        committed yet. """
+ return set([link[0] for link in
+ self.blocks.update_links])
+
+class UpToDateException(Exception):
+ """ Raised to signal that no changes were required to the archive. """
+ def __init__(self, msg):
+ Exception.__init__(self, msg)
+
diff --git a/wormarc/binaryrep.py b/wormarc/binaryrep.py
new file mode 100644
--- /dev/null
+++ b/wormarc/binaryrep.py
@@ -0,0 +1,228 @@
+""" Functions to read and write binary representation of archive data.
+
+ Copyright (C) 2009 Darrell Karbott
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ Author: djk@isFiaD04zgAgnrEC5XJt1i4IE7AkNPqhBG5bONi6Yks
+"""
+
+# REDFLAG: Only tested on x86 32-bit Intel Linux. Alignment/endianness issues?
+# REDFLAG: OK to read/write byte strings directly w/o (un)pack'ing, right?
+# REDFLAG: REDUCE RAM: do chunked read/writes/hash digests where possible.
+
+import struct
+from binascii import hexlify
+from hashlib import sha1
+
+NULL_SHA = '\x00' * 20
+
+LINK_HEADER_FMT = '!LL20s'
+LINK_HEADER_LEN = struct.calcsize(LINK_HEADER_FMT)
+
+COUNT_FMT = "!L"
+COUNT_LEN = struct.calcsize(COUNT_FMT)
+
+# REDFLAG: doc <16k name length
+MANIFEST_ENTRY_HDR_FMT = "!H20s20s"
+MANIFEST_ENTRY_HDR_LEN = struct.calcsize(MANIFEST_ENTRY_HDR_FMT)
+MANIFEST_ENTRY_FMT = MANIFEST_ENTRY_HDR_FMT + "%is"
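+
+# Illustrative entry layout (not from the original code): a manifest
+# entry for the name 'README' occupies MANIFEST_ENTRY_HDR_LEN + 6 = 48
+# bytes: struct.pack(MANIFEST_ENTRY_FMT % 6, 48, file_sha, history_sha,
+# 'README') -- a length, two raw SHA1 digests, then the name.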
+
+MSG_INCOMPLETE_READ = "Bad stream, EOF during read."
+
+READ_CHUNK_LEN = 1024 * 16
+
+def str_sha(raw_sha):
+ """ Return a 12 digit hex string for a raw SHA1 hash. """
+ return hexlify(raw_sha)[:12]
+
+# Used to catch pilot error which otherwise shows up as weird failures.
+def check_shas(raw_sha_sequence):
+ """ INTERNAL: Raise a ValueError if the sequence values don't look like
+ raw SHA1 hashes. """
+ if raw_sha_sequence is None:
+ raise ValueError("SHA1 has sequence is None?")
+ for value in raw_sha_sequence:
+ if value is None:
+ raise ValueError("None instead of binary SHA1 digest")
+
+ if not len(value) == 20:
+ raise ValueError("Doesn't look like a binary SHA1 digest: %s" %
+ repr(value))
+
+def checked_read(in_stream, length, allow_eof=False):
+ """ Read a fixed number of bytes from an open input stream.
+
+    Raises IOError if EOF is encountered before all bytes are read.
+    If allow_eof is True, a clean EOF at the start of the read returns
+    '' instead of raising.
+    """
+
+ bytes = in_stream.read(length)
+ if allow_eof and bytes == '':
+ return bytes
+ if len(bytes) != length:
+ raise IOError(MSG_INCOMPLETE_READ)
+ return bytes
+
+# Wire rep:
+# <total length><age><parent><blob data>
+#
+# Python rep
+# 0 1 2 3 4 5 6
+# (sha1, age, parent, data, stream_offset, stream_index, physical_len)
+#
+# sha1 is hash of age + parent + data
+# physical_len is the number of bytes of storage used to persist
+# the link.
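+#
+# Illustrative layout (not from the original code): a link with a 100
+# byte blob occupies 128 bytes on disk -- the 28 byte header
+# struct.pack(LINK_HEADER_FMT, 128, age, parent_sha) followed by the
+# 100 byte blob.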
+def read_link(in_stream, keep_data=True, pos=None, stream_index=None):
+ """ Read a single history link from an open stream. """
+
+ bytes = checked_read(in_stream, LINK_HEADER_LEN, True)
+ if bytes == '':
+ return None # Clean EOF
+
+ length, age, parent = struct.unpack(LINK_HEADER_FMT, bytes)
+ payload_len = length - LINK_HEADER_LEN # already read header
+ raw = checked_read(in_stream, payload_len)
+
+    # REDFLAG: incrementally read / hash
+ sha_value = sha1(str(age))
+ sha_value.update(parent)
+ sha_value.update(raw)
+
+ if not keep_data:
+ raw = None
+
+ return (sha_value.digest(), age, parent, raw,
+ pos, stream_index, payload_len)
+
+
+def copy_raw_links(in_stream, out_stream, allowed_shas, copied_shas):
+ """ Copy any links with SHA1 hashes in allowed_shas from in_instream to
+ out_stream.
+ """
+ count = 0
+ while True:
+ hdr = checked_read(in_stream, LINK_HEADER_LEN, True)
+ if hdr == '':
+ return count # Clean EOF
+ length, age, parent = struct.unpack(LINK_HEADER_FMT, hdr)
+ sha_value = sha1(str(age))
+ sha_value.update(parent)
+ rest = checked_read(in_stream, length - LINK_HEADER_LEN)
+ sha_value.update(rest)
+ value = sha_value.digest()
+ if value in copied_shas:
+ continue # Only copy once.
+
+ if allowed_shas is None or value in allowed_shas:
+ out_stream.write(hdr)
+ out_stream.write(rest)
+ count += 1
+ copied_shas.add(value)
+
+# Sets pos, but caller must fix stream index
+def write_raw_link(out_stream, age, parent, raw_file, stream_index):
+ """ Write a history link to an open stream.
+
+ Returns a history link tuple for the link written. """
+
+ assert len(parent) == 20 # Raw, not hex string
+
+ pos = out_stream.tell()
+ in_file = open(raw_file, 'rb')
+ try:
+ raw = in_file.read()
+
+ out_stream.write(struct.pack(LINK_HEADER_FMT,
+ len(raw) + LINK_HEADER_LEN,
+ age,
+ parent))
+
+ sha_value = sha1(str(age))
+ sha_value.update(parent)
+
+ out_stream.write(raw)
+ # REDFLAG: read / hash incrementally
+ sha_value.update(raw)
+ finally:
+ in_file.close()
+
+ return (sha_value.digest(), age, parent, None,
+ pos, stream_index, len(raw) + LINK_HEADER_LEN)
+
+def write_file_manifest(name_map, out_stream):
+ """ Write file manifest data to an open stream. """
+
+ out_stream.write(struct.pack(COUNT_FMT, len(name_map)))
+ # Sort to make it easier for diff algos to find contiguous
+ # changes.
+ names = name_map.keys()
+ names.sort()
+ for name in names:
+ length = MANIFEST_ENTRY_HDR_LEN + len(name)
+ file_sha, history_sha = name_map[name]
+
+ out_stream.write(struct.pack(MANIFEST_ENTRY_FMT % len(name),
+ length,
+ file_sha,
+ history_sha,
+                                     name))
+
+def read_file_manifest(in_stream):
+ """ Read file manifest data from an open input stream. """
+ count = struct.unpack(COUNT_FMT, checked_read(in_stream, COUNT_LEN))[0]
+ name_map = {}
+ for dummy in range(0, count):
+ length, file_sha, history_sha = \
+ struct.unpack(MANIFEST_ENTRY_HDR_FMT,
+ checked_read(in_stream,
+ MANIFEST_ENTRY_HDR_LEN))
+
+ length -= MANIFEST_ENTRY_HDR_LEN
+ name = checked_read(in_stream, length)
+
+ assert not name in name_map
+ name_map[name] = (file_sha, history_sha)
+ return name_map
+
+def manifest_to_file(file_name, name_map):
+ """ Write a single manifest to a file. """
+ out_file = open(file_name, 'wb')
+ try:
+ write_file_manifest(name_map, out_file)
+ finally:
+ out_file.close()
+
+def manifest_from_file(file_name):
+ """ Read a single manifest from a file. """
+ in_file = open(file_name, 'rb')
+ try:
+ return read_file_manifest(in_file)
+ finally:
+ in_file.close()
+
+def get_file_sha(full_path):
+ """ Return the 20 byte sha1 hash digest of a file. """
+ in_file = open(full_path, 'rb')
+ try:
+ sha_value = sha1()
+ while True:
+ bytes = in_file.read(READ_CHUNK_LEN)
+ if bytes == "":
+ break
+ sha_value.update(bytes)
+ return sha_value.digest()
+ finally:
+ in_file.close()
+
diff --git a/wormarc/blocknames.py b/wormarc/blocknames.py
new file mode 100644
--- /dev/null
+++ b/wormarc/blocknames.py
@@ -0,0 +1,75 @@
+""" Classes used by BlockStorage to map block ordinals to file names.
+
+ Copyright (C) 2009 Darrell Karbott
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ Author: djk@isFiaD04zgAgnrEC5XJt1i4IE7AkNPqhBG5bONi6Yks
+"""
+
+# Grrrr... separate file to avoid circular dependency.
+
+
+import os
+
+BLOCK_SUFFIX = '.bin'
+
+class BlockNames:
+ """ ABC to map ordinals to file names. """
+ def __init__(self, read_only):
+ self.read_only = read_only
+
+ def read_path(self, ordinal):
+ """ Return a file name to read the block from. """
+ raise NotImplementedError()
+
+ def write_path(self, ordinal):
+ """ Return a file name to write the block to.
+ This can raise a ValueError if the blocks are read only.
+ """
+ if self.read_only:
+ raise ValueError("Blocks are read only!")
+ return self.read_path(ordinal)
+
+class ReadWriteNames(BlockNames):
+ """ A naming policy for an updateable set of blocks. """
+ def __init__(self, block_dir, block_name, suffix):
+ BlockNames.__init__(self, False)
+ self.block_dir = block_dir
+ self.block_name = block_name
+ self.suffix = suffix
+
+ def read_path(self, ordinal):
+ """ Implement pure virtual. """
+ return os.path.join(self.block_dir, "%s_%s%s" %
+ (self.block_name,
+ str(ordinal),
+ self.suffix))
+
+# UNTESTED!
+# DESIGN INTENT: Adapter that allows you to load a BlockStorage from
+# a static cache of CHK blocks.
+class ReadOnlyNames(BlockNames):
+ """ A naming policy for a read only set of blocks. """
+ def __init__(self, read_only_file_names):
+ BlockNames.__init__(self, True)
+ self.file_names = read_only_file_names
+
+ def read_path(self, ordinal):
+ """ Implement pure virtual. """
+ if ordinal < 0 or ordinal >= len(self.file_names):
+ raise IndexError("No such file: %i" % ordinal)
+ return self.file_names[ordinal]
+
diff --git a/wormarc/blocks.py b/wormarc/blocks.py
new file mode 100644
--- /dev/null
+++ b/wormarc/blocks.py
@@ -0,0 +1,461 @@
+""" Classes to store collections of archive history links in files.
+
+ Copyright (C) 2009 Darrell Karbott
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ Author: djk@isFiaD04zgAgnrEC5XJt1i4IE7AkNPqhBG5bONi6Yks
+"""
+
+# REDFLAG: CLEANUP ERROR HANDLING. Failures can lose or corrupt blocks!
+
+import os
+
+from archive import MIN_BLOCK_LEN, UpToDateException
+from linkmap import LinkMap
+from binaryrep import NULL_SHA, copy_raw_links
+
+# REDFLAG: rtfm python tempfile module. is this really needed?
+class ITempFileManager:
+ """ Delegate to handle temp file creation and deletion. """
+ def __init__(self):
+ pass
+ def make_temp_file(self):
+ """ Return a new unique temp file name including full path. """
+ raise NotImplementedError()
+ def remove_temp_file(self, name):
+ """ Remove and existing temp file. """
+ raise NotImplementedError()
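+
+# A minimal sketch of a concrete delegate, NOT part of the original
+# module. It assumes the standard tempfile module and a dedicated
+# scratch directory:
+#
+# import tempfile
+#
+# class SimpleTempFileManager(ITempFileManager):
+#     """ Hands out temp files in a single scratch directory. """
+#     def __init__(self, scratch_dir):
+#         ITempFileManager.__init__(self)
+#         self.scratch_dir = scratch_dir
+#     def make_temp_file(self):
+#         handle, name = tempfile.mkstemp(dir=self.scratch_dir)
+#         os.close(handle)
+#         return name
+#     def remove_temp_file(self, name):
+#         if name and os.path.exists(name):
+#             os.remove(name)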
+
+def has_internal_zero(sequence):
+ """ Return True if the sequence has a zero to non-zero transition,
+ False otherwise. """
+ saw_zero = False
+ for value in sequence:
+ if value == 0:
+ saw_zero = True
+ else:
+ if saw_zero:
+ return True
+ return False
+
+# DESIGN INTENT: Push file system dependencies out of archive code.
+class BlockStorage:
+ """ A class to store history links in a collection of files. """
+ def __init__(self, tmps, name_policy=None):
+ self.tmps = tmps
+ self.names = name_policy
+ self.tags = ['', ] # Is also a proxy for length.
+
+ self.link_map = None
+
+        # Hmmmm... file and stream belong in storage,
+        # but links belong in the archive....
+ self.update_file = None
+ self.update_stream = None
+ self.update_links = []
+
+ def is_updating(self):
+ """ Return True if updating, False otherwise. """
+ # Hmmm...
+ return not self.update_stream is None
+
+ def close(self):
+ """ Close the files. """
+ self.abandon_update()
+ if self.link_map is None:
+ return
+ self.link_map.close()
+ self.link_map = None
+
+ def full_path(self, ordinal, read=True):
+ """ Return the full path to an underlying block file. """
+ if read:
+ return self.names.read_path(ordinal)
+
+ return self.names.write_path(ordinal)
+
+ def get_history(self, head_sha1):
+ """ Return the history link chain which has a head link with hash
+ head_sha1. """
+ if head_sha1 == NULL_SHA:
+ return []
+ ret = []
+ head = head_sha1
+ while True:
+ link = self.link_map.get_link(head)
+ ret.append(link[:]) # Copy
+ if link[2] == NULL_SHA:
+ return ret
+ head = link[2]
+
+ def start_update(self):
+ """ Create temporary storage required to write an update.
+
+ You MUST call commit_update() or abandon_update() after
+ calling this. """
+
+ if not self.update_file is None:
+ raise Exception("Commmit or abandon the previous update!")
+ self.update_file = self.tmps.make_temp_file()
+ self.update_links = []
+ raised = True
+ try:
+ self.update_stream = open(self.update_file, "wb")
+ raised = False
+ finally:
+ if raised:
+ self.abandon_update()
+
+    # UpToDateException is recoverable; all others are fatal,
+    # i.e. zorch the instance.
+ def commit_update(self, referenced_shas=None):
+ """ Permanently write changes into the archive.
+
+ This creates a new block which may replace an
+ existing one. """
+
+ assert not referenced_shas is None
+
+        if self.update_file is None or len(self.update_links) == 0:
+            raise UpToDateException("No changes to commit.")
+
+ age = 0
+ # Automagically add history for self and parents
+ for link in self.update_links:
+ age = max(age, link[1])
+ # New link
+ referenced_shas.add(link[0])
+ # Previous history
+ # TRICKY: You can't call get_history on the new link itself
+ # because it isn't commited yet.
+ for child in self.get_history(link[2]):
+ referenced_shas.add(child[0])
+
+ try:
+ self.update_stream.close()
+ self.update_stream = None # see is_updating()
+ self.add_block(referenced_shas)
+ return age
+ finally:
+ # Always clean up, even on success.
+ # EXCEPTIONS ARE FATAL!
+ self.abandon_update()
+
+ def abandon_update(self):
+ """ Free temporary storage associated with an update without
+ committing it. """
+
+ self.update_links = []
+ if not self.update_stream is None:
+ self.update_stream.close()
+ self.update_stream = None
+ if not self.update_file is None:
+ self.tmps.remove_temp_file(self.update_file)
+ self.update_file = None
+
+ # Returns 0
+    def create(self, name_policy, num_blocks, overwrite=False):
+ """ Initialize the instance by creating a new set of empty
+ block files. """
+
+ if name_policy.read_only:
+ raise ValueError("Names are read only! Use load() instead?")
+ self.names = name_policy
+ self.tags = ['', ] # Length == 1
+ if not overwrite:
+ for ordinal in range(0, num_blocks):
+ if os.path.exists(self.full_path(ordinal, False)):
+ raise IOError("Already exists: %s" %
+ self.full_path(ordinal, False))
+
+ for ordinal in range(0, num_blocks):
+ out_file = open(self.full_path(ordinal, False), 'wb')
+ out_file.close()
+
+ return self.load(name_policy, num_blocks)
+
+ # hmmmm... want to use hash names for blocks
+ # blocks is [[file_name, desc, dirty], ...]
+ # returns maximum age
+ def load(self, name_policy, num_blocks, tags=None):
+ """ Initialize the instance by loading from an existing set of
+ block files. """
+
+ self.names = name_policy
+ if tags is None:
+ tags = ['' for dummy in range(0, num_blocks)]
+
+ # DESIGN INTENT: Meant for keeping track of Freenet CHK's
+ assert len(tags) == num_blocks
+ self.tags = tags[:]
+ self.link_map = LinkMap()
+ age, counts = self.link_map.read([self.full_path(ordinal, False)
+ for ordinal in range(0, num_blocks)])
+ assert not has_internal_zero(counts)
+ if max(counts) == 0:
+ self.tags = self.tags[:1] # Length == 1
+ return age
+
+ # Includes 0 length blocks
+ def total_blocks(self):
+ """ Return the total number of blocks including 0 length ones. """
+ return len(self.tags)
+
+ # Hmmmm... physical length.
+ def nonzero_blocks(self):
+ """ Return the number of non-zero length blocks. """
+ for ordinal in range(0, len(self.tags)):
+ if os.path.getsize(self.full_path(ordinal)) == 0:
+ # Check for illegal internal zero length blocks.
+ for index in range(ordinal + 1, len(self.tags)):
+ if os.path.exists(self.full_path(index)):
+ assert os.path.getsize(self.full_path(index)) == 0
+ return ordinal
+
+ return len(self.tags)
+
+ # This may delete self.update_file, but caller is
+ # still responsible for cleaning it up.
+ #
+ # Breaks length and ordering invariants.
+ def add_block(self, referenced_shas, tag=''):
+ """ INTERNAL: Add the temporary update file to the permanent
+ block files. """
+
+ assert not self.is_updating()
+ assert not self.names.read_only
+ update_len = os.path.getsize(self.update_file)
+ head_len = os.path.getsize(self.full_path(0, False))
+ tmp = None
+ try:
+ # Doesn't change length
+ if update_len + head_len < MIN_BLOCK_LEN:
+ # Link map has open file descriptors.
+ # Must close or os.remove() below fails on Windows.
+ self.link_map.close()
+
+ # We might merge with an empty block here, but it
+ # doesn't matter since the length is bounded. Do better?
+
+ # Can just append to the first block.
+ # [N + O1] ...
+ tmp = self.merge_blocks((self.update_file,
+ self.full_path(0, False)),
+ referenced_shas)
+ if os.path.exists(self.full_path(0, False)):
+ # REDFLAG: What if this fails?
+ os.remove(self.full_path(0, False))
+
+ os.rename(tmp, self.full_path(0, False))
+ self.tags[0] = tag
+
+ # Deletes update file IFF we get here.
+ tmp = self.update_file
+ self.update_file = None
+
+ # Fix the link map.
+ fixups = {} # Drop links in head block.
+ for index in range(1, self.total_blocks()):
+ fixups[index] = index
+ # Potentially SLOW.
+ self.link_map.update_blocks(fixups,
+ [self.full_path(index, False)
+ for index in
+ range(0, self.total_blocks())],
+ [0,]) # Read links from head block.
+ return
+
+
+ # Deletes update file always.
+ tmp = self.update_file
+ self.update_file = None
+
+ # Close the link map before messing with files.
+ self.link_map.close()
+
+ self.prepend_block(tmp)
+ self.tags.insert(0, tag) # Increments implicit length
+
+ # Fix the link map.
+ fixups = {}
+ for index in range(0, self.total_blocks() - 1): # We inserted!
+ fixups[index] = index + 1
+ # Potentially SLOW.
+ self.link_map.update_blocks(fixups,
+ [self.full_path(index, False)
+ for index in
+ range(0, self.total_blocks())],
+ [0,]) # Read links from head block.
+ finally:
+ self.tmps.remove_temp_file(tmp)
+
+ # Returns tmp file with merged blocks.
+ # Caller must delete tmp file.
+ def merge_blocks(self, block_file_list, referenced_shas):
+ """ INTERNAL: Merge blocks into a single file. """
+ tmp = self.tmps.make_temp_file()
+ copied_shas = set([])
+ raised = True
+ try:
+ out_file = open(tmp, 'wb')
+ try:
+ for name in block_file_list:
+ in_file = open(name, "rb")
+ try:
+ # Hmmm... do something with count?
+ #count = copy_raw_links(in_file, out_file,
+ # referenced_shas)
+ copy_raw_links(in_file, out_file,
+ referenced_shas, copied_shas)
+ finally:
+ in_file.close()
+ finally:
+ out_file.close()
+ raised = False
+ return tmp
+ finally:
+ if raised:
+ self.tmps.remove_temp_file(tmp)
+
+ # Implementation helper function, caller deals with file cleanup.
+ # REQUIRES: new_block not an extant block file.
+ def prepend_block(self, new_block):
+ """ INTERNAL: Insert a new block at the head of the block list. """
+
+ assert not self.is_updating()
+ assert self.update_file is None
+ # Shift all extant blocks up by one index
+ for index in range(self.total_blocks() - 1, -1, -1):
+ if os.path.exists(self.full_path(index + 1, False)):
+ # REDFLAG: failure?
+ os.remove(self.full_path(index + 1, False))
+ # REDFLAG: failure?
+ os.rename(self.full_path(index, False),
+ self.full_path(index + 1, False))
+ # Now copy the update block into the 0 position.
+ os.rename(new_block, self.full_path(0, False))
+
+
+ def _make_new_files(self, new_blocks, referenced_shas, tmp_files):
+ """ INTERNAL: Implementation helper for update_blocks(). """
+        new_files = {}
+        for partition in new_blocks:
+            # Calling code should have already dropped empty blocks.
+            new_files[partition] = self.merge_blocks(
+                [self.full_path(index, False)
+                 for index in range(partition[0], partition[1] + 1)],
+                referenced_shas)
+            tmp_files.append(new_files[partition])
+        return new_files
+
+ def _remove_old_files(self, dropped_blocks):
+ """ INTERNAL: Implementation helper for update_blocks(). """
+ # Delete the files for dropped blocks
+ for partition in dropped_blocks:
+ assert partition[0] == partition[1]
+ if not os.path.exists(self.full_path(partition[0], False)):
+ continue
+ os.remove(self.full_path(partition[0], False))
+
+ def _copy_old_blocks(self, old_blocks, tmp_files):
+ """ INTERNAL: Implementation helper for update_blocks(). """
+ renamed = {}
+ for partition in old_blocks:
+ assert partition[0] == partition[1]
+
+ src = self.full_path(partition[0], False)
+ assert os.path.exists(src)
+ dest = self.tmps.make_temp_file()
+ tmp_files.append(dest)
+ os.rename(src, dest)
+ renamed[partition] = dest
+ return renamed
+
+ def _update_block_files(self, compressed, uncompressed,
+ referenced_shas, tmp_files):
+ """ INTERNAL: Implementation helper for update_blocks(). """
+
+ # Hmmm... to appease pylint max local vars constraint.
+ #new_blocks = set(compressed) - set(uncompressed)
+ old_blocks = set(compressed).intersection(set(uncompressed))
+ #dropped_blocks = set(uncompressed) - old_blocks
+
+ # Build new blocks in tmp files
+ new_files = self._make_new_files(set(compressed) - set(uncompressed),
+ referenced_shas,
+ tmp_files)
+ # Delete the files for dropped blocks
+ self._remove_old_files(set(uncompressed) - old_blocks)
+ # Move old blocks into tmp files
+ renamed = self._copy_old_blocks(old_blocks, tmp_files)
+
+ new_tags = ['' for dummy in range(0, len(compressed))]
+ new_indices = []
+ ordinal_fixups = {}
+ # Rename blocks onto new block ordinals
+ for index, block in enumerate(compressed): #hmmm not a set???
+ dest = self.full_path(index, False)
+ assert not os.path.exists(dest)
+ if block in set(compressed) - set(uncompressed):
+ os.rename(new_files[block], dest)
+ new_tags[index] = 'new' # best we can do.
+ new_indices.append(index)
+ continue
+
+ assert block in old_blocks
+ os.rename(renamed[block], dest)
+ # Copy the old tag value into the right position
+ new_tags[index] = self.tags[block[0]]
+ # Save info we need to fix the link_map
+ ordinal_fixups[block[0]] = index
+ self.tags = new_tags
+ return (new_tags, new_indices, ordinal_fixups)
+
+ # REDFLAG: Failure.
+ def update_blocks(self, uncompressed, compressed, referenced_shas,
+ min_blocks):
+ """ Repartition the underlying block files into the partitions
+ described by compressed. """
+
+ assert not self.is_updating()
+ assert not self.names.read_only
+
+ tmp_files = []
+ try:
+ self.link_map.close()
+ self.tags, new_indices, ordinal_fixups = \
+ self._update_block_files(compressed, uncompressed,
+ referenced_shas, tmp_files)
+
+ # Drop links for unreferenced blocks and shift indices.
+ # Then read links from new block files.
+ self.link_map.update_blocks(ordinal_fixups,
+ [self.full_path(index, False)
+ for index in
+ range(0, self.total_blocks())],
+ new_indices)
+
+ # Add trailing zero length blocks.
+ for index in range(self.nonzero_blocks(), min_blocks):
+ out_file = open(self.full_path(index, False), 'wb')
+ out_file.close()
+ finally:
+ for name in tmp_files:
+ self.tmps.remove_temp_file(name)
+
+
diff --git a/wormarc/deltacoder.py b/wormarc/deltacoder.py
new file mode 100644
--- /dev/null
+++ b/wormarc/deltacoder.py
@@ -0,0 +1,208 @@
+""" A delta encoder/decoder based on Mercurial's binary diff/patch code.
+
+    ATTRIBUTION: Contains source fragments written by Matt Mackall.
+
+ Copyright (C) 2009 Darrell Karbott
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ Author: djk@isFiaD04zgAgnrEC5XJt1i4IE7AkNPqhBG5bONi6Yks
+"""
+
+# For names in pillaged Mercurial code.
+# pylint: disable-msg=C0103, W0141
+
+import zlib
+from mercurial import mdiff
+
+
+from binaryrep import NULL_SHA
+############################################################
+# ATTRIBUTION: Pillaged from Mercurial revlog.py by Matt Mackall
+# Then hacked, so bugs are mine.
+_compress = zlib.compress
+_decompress = zlib.decompress
+
+def compress(text):
+ """ generate a possibly-compressed representation of text """
+ if not text:
+ return ("", text)
+ l = len(text)
+ bin = None
+ if l < 44: # Is this Mercurial specific or a zlib overhead thing?
+ pass
+ elif l > 1000000:
+ # zlib makes an internal copy, thus doubling memory usage for
+ # large files, so lets do this in pieces
+ z = zlib.compressobj()
+ p = []
+ pos = 0
+ while pos < l:
+ pos2 = pos + 2**20
+ p.append(z.compress(text[pos:pos2]))
+ pos = pos2
+ p.append(z.flush())
+ if sum(map(len, p)) < l:
+ bin = "".join(p)
+ else:
+ bin = _compress(text)
+ if bin is None or len(bin) > l:
+ if text[0] == '\0':
+ return ("", text)
+ return ('u', text)
+ return ("", bin)
+
+def decompress(bin):
+ """ decompress the given input """
+ if not bin:
+ return bin
+ t = bin[0]
+ if t == '\0':
+ return bin
+ if t == 'x':
+ return _decompress(bin)
+ if t == 'u':
+ return bin[1:]
+
+ raise Exception("unknown compression type %r" % t)
+
+ # _ is a function defined in i18n.py to call i18n.gettext.
+ #raise RevlogError(_("unknown compression type %r") % t)
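+
+# Round-trip property (illustrative, not from the original code):
+#
+#   head, body = compress(text)
+#   assert decompress(head + body) == text
+#
+# Callers always write head followed by body, so a stored blob always
+# decompresses back to the original text.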
+
+############################################################
+
+# REDFLAG: wants_stream ENOTIMPL, who closes stream?
+# Returns a readable stream if wants_stream is True, otherwise the raw
+# patch data.
+# def example_get_data_func(history_link, wants_stream=False):
+# pass
+
+class DeltaCoder:
+ """ Wrapper around the delta compression/decompression implementation
+ used by the Mercurial Revlog.
+
+ See revlog.py, mdiff.py, mpatch.c, bdiff.c in Mercurial codebase.
+ """
+ def __init__(self):
+ self.get_data_func = lambda x:None
+ self.tmp_file_mgr = None
+
+ # Define an ABC? What would the runtime overhead be?
+ # Subclass might need tmp_file_mgr or get_data_func.
+ # pylint: disable-msg=R0201
+ def make_full_insert(self, new_file, out_file_name,
+ disable_compression=False):
+ """ Make a blob readable by apply_deltas containing the entire file. """
+
+ in_file = open(new_file, 'rb')
+ raw_new = None
+ try:
+ raw_new = in_file.read()
+ finally:
+ in_file.close()
+
+ if disable_compression:
+ values = ('u', raw_new)
+ else:
+ values = compress(raw_new)
+
+ out_file = open(out_file_name, 'wb')
+ try:
+ if values[0]:
+ out_file.write(values[0])
+ out_file.write(values[1])
+ finally:
+ out_file.close()
+
+ return NULL_SHA
+
+ # Writes a new delta blob into out_files
+ # Returns parent sha1.
+ # Can truncate history by returning NULL_SHA
+ def make_delta(self, history_chain, old_file, new_file, out_file_name):
+ """ Make a new binary change blob and write it into out_file_name.
+
+ """
+ if len(history_chain) == 0:
+ #print "DOING FULL INSERT"
+ return self.make_full_insert(new_file, out_file_name)
+
+ #print "MAKING DELTA"
+ in_file = open(new_file, 'rb')
+ raw_new = None
+ try:
+ raw_new = in_file.read()
+ finally:
+ in_file.close()
+
+ parent = NULL_SHA
+ in_old = open(old_file, 'rb')
+ try:
+ raw_old = in_old.read()
+ values = compress(mdiff.textdiff(raw_old, raw_new))
+ parent = history_chain[0][0]
+ out_file = open(out_file_name, 'wb')
+ try:
+ if values[0]:
+ out_file.write(values[0])
+ out_file.write(values[1])
+ finally:
+ out_file.close()
+ finally:
+ in_old.close()
+
+ return parent
+
+ # All text and patches kept in RAM.
+ # Rebuilds the file by applying all the deltas in the history chain.
+ def apply_deltas(self, history_chain, out_file_name):
+ """ Rebuild a file from a series of patches and write it into
+ out_file_name. """
+ assert len(history_chain) > 0
+
+ deltas = []
+ text = None
+ index = 0
+ while index < len(history_chain):
+ link = history_chain[index]
+ if link[2] == NULL_SHA:
+ text = link[3]
+ if text is None:
+ text = self.get_data_func(link[0])
+ break
+
+ delta = link[3]
+ if delta is None:
+ delta = self.get_data_func(link[0])
+ assert not delta is None
+ deltas.append(delta)
+ index += 1
+
+ assert not text is None
+ text = decompress(text)
+ if len(deltas) == 0:
+ raw = text
+ else:
+ for index in range(0, len(deltas)):
+ deltas[index] = decompress(deltas[index])
+ deltas.reverse() # iterate in reverse?
+ raw = mdiff.patches(text, deltas)
+
+ text = None
+ out_file = open(out_file_name, "wb")
+ try:
+ out_file.write(raw)
+ finally:
+ out_file.close()
diff --git a/wormarc/design.txt b/wormarc/design.txt
new file mode 100644
--- /dev/null
+++ b/wormarc/design.txt
@@ -0,0 +1,74 @@
+djk20091206 -- personal notes, probably not very useful to anyone else.
+
+OVERVIEW:
+Git/hg lite. A file is a linear chain of delta encoded patches. Patches
+are referenced by sha(age + parent + data). An Archive is a collection of
+patches and has one "root object" patch chain head. A FileManifest is an
+arbitrary mapping of human readable names to patch chain heads.
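+
+Illustrative example (not in the code): version 3 of a file might be the
+chain
+  H3(parent=H2, delta v2->v3) -> H2(parent=H1, delta v1->v2)
+    -> H1(parent=NULL_SHA, full text of v1)
+where Hn = sha1(age + parent + data). The file is addressed by H3;
+reading it decompresses v1, then applies the two deltas in order.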
+
+patch
+ HASA *one* parent (but multiple patches can have the same parent)
+ HASA sha1
+ HASA age
+
+patch chain
+ HASA ordered sequences of patch'es
+
+file
+ ISA patch chain
+
+Manifest
+ ISA file
+ HASA file sha -> patchchain map
+
+ can determine every referenced patch by walking all the patch chains
+ can generate new patches for an update from a file list
+
+Archive
+ HASA unordered collection of patches
+ HASA "special" patch chain sha1 which points to the root object.
+
+
+ can partition patches across a bounded number of read only files
+ can map patch sha1s to patches (via Blocks, LinkMap)
+ tries to write as little as possible
+ tries to update the oldest files as little as possible
+ knows to drop unreferenced patch chains
+ could incrementally update a local directory based on locally cached archive state
+ ??? COOL, BUT REALLY REQUIRED?, requires age
+ i.e. rsync like behaviour
+top key
+ HASA ordered sequence of bundle CHKS
+ HASA root object sha
+
+ Is the in Freenet rep of an Archive
+
+
+PROS:
+sites load fully in a bounded number of freenet fetches
+CHKs for older content used in newer updates.
+Easy to implement top key redundancy, reinsertion
+
+CONS:
+Slower in typical use case?
+ offset by local caching of old bundles
+ fast binary patching algo
+Requires either lots of memory or fast random access to encrypted storage.
+"Defrag" usability issue. Inserts are usually fast, but could take a long time.
+Inserting is stateful. i.e. you must be able to fetch the previous version.
+Complexity. Need to disable diffs for static files (e.g. images)
+
+USES:
+Wiki
+Freesites
+PyFreenetHg
+git?
+
+Important Classes:
+WORMBlockArchive -- The archive
+Blocks -- Delegate of WORMBlockArchive to handle reading /
+ writing patch chains to files.
+BlockNames -- Delegate of Blocks to abstract ordinal to file name mapping.
+LinkMap -- Delegate of Blocks to keep a SHA1 addressable map of patch chain links
+FileManifest -- Map of human readable names to patch chain head SHA1s
+
diff --git a/wormarc/filemanifest.py b/wormarc/filemanifest.py
new file mode 100644
--- /dev/null
+++ b/wormarc/filemanifest.py
@@ -0,0 +1,485 @@
+""" Classes to address files stored in a WORMBlockArchive by
+ human readable name.
+
+    ATTRIBUTION: Contains source fragments written by Matt Mackall.
+
+ Copyright (C) 2009 Darrell Karbott
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ Author: djk@isFiaD04zgAgnrEC5XJt1i4IE7AkNPqhBG5bONi6Yks
+"""
+
+import os
+import shutil
+
+from binaryrep import NULL_SHA, manifest_from_file, \
+ manifest_to_file, get_file_sha, check_shas, str_sha
+
+from archive import UpToDateException
+
+def is_printable_ascii(value):
+ """ Return True if all the characters in value are printable
+ ASCII, False otherwise. """
+ value = [ord(c) for c in value]
+ # Hmmm... allow spaces
+ return max(value) <= 0x7e and min(value) >= 0x20
+
+#----------------------------------------------------------#
+
+# Hmmmm... this feels horrifically overdesigned, but I need a way
+# to decouple the data that you can insert into a manifest from
+# the manifest implementation.
+class IManifestEntry:
+ """ Abstract base class for things that can be referenced
+ from a FileManifest. """
+ def __init__(self):
+ pass
+
+ def get_name(self):
+ """ Returns the name to insert this entry under in the manifest. """
+ #raise NotImplementedError()
+ pass
+
+ def make_file(self):
+ """ Returns the full path to the data to insert.
+ May create a temp file which it can clean up in release().
+ """
+ #raise NotImplementedError()
+ pass
+
+ # MUST call this.
+ def release(self):
+ """ Cleanup method called when the instance is no longer in use. """
+ #raise NotImplementedError()
+ pass
+
+class FileManifest:
+ """ An index which maps human readable names to files in an archive. """
+ def __init__(self, name_map=None, history_sha=NULL_SHA):
+ check_shas([history_sha, ])
+        if name_map is None:
+ name_map = {}
+
+ # name -> (file sha1, patch chain head sha1)
+ self.name_map = name_map
+ # Hmmmm... convenient, but it ties the manifest to an archive.
+ self.stored_sha = history_sha
+
+ @classmethod
+ def from_archive(cls, archive, history_sha):
+ """ Create a FileManifest from a file in the archive. """
+ check_shas([history_sha, ])
+ tmp_name = archive.blocks.tmps.make_temp_file()
+ try:
+ archive.get_file(history_sha, tmp_name)
+ # Hmmmm... age... put back in manifest?
+ name_map = manifest_from_file(tmp_name)
+ return FileManifest(name_map, history_sha)
+ finally:
+ archive.blocks.tmps.remove_temp_file(tmp_name)
+
+ # hmmmm... not to_archive, would expect that to be an instance member.
+ @classmethod
+ def write_manifest(cls, archive, name_map, history_sha):
+ """ Helper, writes updated manifest to archive.
+ Returns link.
+ """
+ check_shas([history_sha, ])
+ # Add manifest
+ tmp_file_name = archive.blocks.tmps.make_temp_file()
+ try:
+ manifest_to_file(tmp_file_name, name_map)
+ return archive.write_new_delta(history_sha, tmp_file_name)
+ finally:
+ archive.blocks.tmps.remove_temp_file(tmp_file_name)
+
+ def make_file_sha_map(self):
+ """ INTERNAL: Make a file_sha -> (file_sha, patch_sha) map
+ from name_map. """
+ file_sha_map = {}
+ for name in self.name_map:
+ pair = self.name_map[name]
+ file_sha_map[pair[0]] = pair
+ return file_sha_map
+
+ # Doesn't change manifest or archive.
+ def write_changes(self, archive, entry_infos, prev_manifest_sha=NULL_SHA):
+ """ INTERNAL: Helper function for update().
+
+ Writes the changes required to add the IManifestEntries
+ in entries_infos to an archive.
+
+ Raises UpToDateException if there are no changes.
+
+ Return an (updated_name_map, manifest_sha) tuple. """
+
+ check_shas([prev_manifest_sha, ])
+
+ file_sha_map = self.make_file_sha_map()
+ new_name_map = {}
+ updated = False
+
+ for info in entry_infos:
+ full_path = info.make_file()
+ try:
+ name = info.get_name()
+ if not is_printable_ascii(name):
+ raise IOError("Non-ASCII name: %s" % repr(name))
+ hash_info = self.name_map.get(name, None)
+ file_sha = get_file_sha(full_path)
+ if hash_info is None:
+ updated = True
+ if file_sha in file_sha_map:
+ # Renamed
+ new_name_map[name] = file_sha_map[file_sha]
+ else:
+ # REDFLAG: We lose history for files which are renamed
+ # and modified.
+ # Created (or renamed and modified)
+ link = archive.write_new_delta(NULL_SHA, full_path)
+ new_name_map[name] = (file_sha, link[0])
+ else:
+ if self.name_map[name][0] == file_sha:
+ # Exists in manifest and is unmodified.
+ new_name_map[name] = self.name_map[name]
+ continue
+
+ # Modified
+ updated = True
+ link = archive.write_new_delta(self.name_map[name][1],
+ full_path)
+ new_name_map[name] = (file_sha, link[0])
+
+                # delete == orphaned history, NOP
+ finally:
+ info.release()
+
+ if not updated:
+ if (frozenset(new_name_map.keys()) ==
+ frozenset(self.name_map.keys())):
+ raise UpToDateException("The file manifest is up to date.")
+
+ # Add updated manifest
+ link = FileManifest.write_manifest(archive, new_name_map,
+ prev_manifest_sha)
+
+ return (new_name_map, link[0])
+
+ # Only works if fully committed!
+ def all_shas(self, archive):
+ """ Return the SHA1 hashes of all history links required to store
+ the files referenced by the manifest. """
+        shas = set([entry[1] for entry in self.name_map.values()])
+        shas.add(self.stored_sha)
+        history_shas = set([])
+        for value in shas:
+            history_shas.update(set([link[0] for link in
+                                     archive.blocks.get_history(value)]))
+        return shas.union(history_shas)
+
+ # Changes both the manifest and the archive.
+ # other_head_shas is for other files in the archive not
+ # handled by this manifest.
+ def update(self, archive, entry_infos, other_head_shas=None,
+ truncate_manifest_history=False):
+ """ Update the manifest with the changes in entry infos and
+ write the changes and the updated manifest into the archive. """
+ if other_head_shas is None:
+ other_head_shas = set([])
+
+ check_shas(other_head_shas)
+
+ archive.start_update()
+ raised = True
+ try:
+ prev_sha = self.stored_sha
+ if truncate_manifest_history:
+ prev_sha = NULL_SHA
+
+ new_names, root_sha = self.write_changes(archive,
+ entry_infos,
+ prev_sha)
+
+ # History for all files except recently modified ones.
+ old_shas = set([])
+
+ new_shas = archive.uncommited_shas()
+
+ for value in new_names.values():
+ if value[1] in new_shas:
+ # Adding history for new values is handled by
+ # commit_update().
+ continue
+
+                # We need to explicitly add history for the files which
+ # still exist in the manifest but didn't change.
+ for link in (archive.blocks.get_history(value[1])):
+ old_shas.add(link[0])
+
+ all_shas = archive.referenced_shas(old_shas.
+ union(other_head_shas))
+
+ archive.commit_update(all_shas)
+ self.stored_sha = root_sha
+ self.name_map = new_names
+ raised = False
+ finally:
+ if raised:
+ archive.abandon_update()
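+
+# Illustrative call (not from the original code), updating a manifest
+# from a local directory tree using entries_from_dir() defined below:
+#
+#   manifest.update(archive, entries_from_dir('/some/site/dir', True))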
+
+
+def verify_manifest(archive, manifest, brief=False):
+ """ Debugging function to verify the integrity of a manifest. """
+ failures = 0
+ for name in manifest.name_map:
+ tmp = archive.blocks.tmps.make_temp_file()
+ file_sha, link_sha = manifest.name_map[name]
+ if not brief:
+ print "Verifying: %s %s => %s)" % (name,
+ str_sha(file_sha),
+ str_sha(link_sha))
+ archive.get_file(link_sha, tmp)
+ history = archive.blocks.get_history(link_sha)
+ if not brief:
+ print "History: " + " ".join([str_sha(link[0])
+ for link in history])
+
+ retrieved_sha = get_file_sha(tmp)
+ if retrieved_sha != file_sha:
+ print "Expected: %s, but got %s." % (str_sha(file_sha),
+ str_sha(retrieved_sha))
+ failures += 1
+ else:
+ if not brief:
+ print "Ok. Read %i bytes." % os.path.getsize(tmp)
+
+ archive.blocks.tmps.remove_temp_file(tmp)
+
+ if failures > 0:
+ print "%i entries failed to verify!" % failures
+ assert False
+
+def fix_backwards_slashes(name):
+ """ Helper to fix backwards slashes in windows file names. """
+ if os.sep != '\\' or name.find('\\') == -1:
+ return name
+
+ return '/'.join(name.split('\\'))
+
+class PathEntry(IManifestEntry):
+ """ IManifestEntry implementation for a path to a file on the
+ local filesystem. """
+ def __init__(self, full_path, name):
+ IManifestEntry.__init__(self)
+ self.full_path = full_path
+ self.name = fix_backwards_slashes(name)
+
+ def get_name(self):
+ """ IManifestEntry implementation. """
+ return self.name
+
+ def make_file(self):
+ """ IManifestEntry implementation. """
+ return self.full_path
+
+
+    # make_file() just returns the local path; release() is a NOP.
+
+
+# skips empty directories
+# LATER: updates w/o sending all data?
+# only send files which have changes since
+# a local sha1 list file has changed, just send sha1s of others.
+# LATER: add accept_regex?
+def entries_from_dir(start_dir, recurse, ignore_regex=None, include_dirs=False):
+ """ An iterator which yields FileManifestEntries for
+ files in a directory. """
+ stack = [start_dir]
+ while len(stack) > 0:
+ current_dir = stack.pop()
+ names = os.listdir(current_dir)
+ for name in names:
+ if not ignore_regex is None and ignore_regex.match(name):
+ continue
+ full_path = os.path.join(current_dir, name)
+ if os.path.isdir(full_path) and recurse:
+ if include_dirs:
+ # Hack so that I can delete unreferenced dirs
+ # in manifest_to_dir
+ yield PathEntry(full_path, '')
+ stack.append(full_path)
+ if os.path.isfile(full_path):
+ name = full_path[len(start_dir):]
+ while len(name) > 0 and name.startswith(os.sep):
+ name = name[1:]
+ if len(name) > 0:
+ yield PathEntry(full_path, name)
+
+def find_dirs(name_map, target_dir):
+ """ INTERNAL: Helper function used by manifest_to_dir(). """
+
+ dirs = set([])
+ for file_name in name_map:
+ dir_name = os.path.dirname(os.path.join(target_dir, file_name))
+ if not dir_name:
+ continue # Hmmm
+ if dir_name == os.sep:
+ continue # Hmmm
+ dirs.add(dir_name)
+
+ return dirs
+
+def read_local_dir(manifest, target_dir, dirs, ignore_regex):
+ """ INTERNAL: Helper function used by manifest_to_dir(). """
+ # Read local directory state.
+ overwrite = set([])
+ remove = {} # name -> path
+ local_dirs = set([])
+ extant = set([])
+ for entry in entries_from_dir(target_dir, True, ignore_regex, True):
+ name = entry.get_name()
+ extant.add(name)
+ full_path = entry.make_file()
+ if name == '':
+ # Because we told entries_from_dir to return directories.
+ local_dirs.add(full_path)
+ continue
+
+ local_dirs.add(os.path.dirname(full_path))
+ if name in manifest.name_map:
+ overwrite.add(name)
+ else: # skip directory entries
+ remove[name] = entry.make_file()
+ entry.release()
+
+ # O(N*M) hmmm....
+ # Remove non-leaf subdirectories.
+ for stored_dir in dirs:
+ for local_dir in local_dirs.copy():
+ if stored_dir.startswith(local_dir):
+ local_dirs.remove(local_dir)
+
+
+ return (overwrite, remove, local_dirs, extant)
+
+# Hmmm... wackamole code.
+# REDFLAG: Other ways to make sleazy path references.
+def validate_path(base_dir, full_path):
+ """ Catch references to direcories above base_dir. """
+ base_dir = os.path.abspath(base_dir)
+
+ if type(full_path) is unicode:
+ raise IOError("Unicode path name: %s" % repr(full_path))
+ if not is_printable_ascii(full_path):
+ raise IOError("Non-ASCII path name: %s" % repr(full_path))
+
+ full_path = os.path.abspath(full_path)
+
+ if not (len(full_path) > len(base_dir) and
+ full_path.startswith(base_dir)):
+ raise IOError("Hinky path in manifest: %s" % full_path)
+
+# No error handling or cleanup.
+# Doubtful this works on Windows; backwards path separators would
+# need to be handled.
+def manifest_to_dir(archive, manifest, target_dir, ignore_regex=None,
+ dry_run=False):
+ """ Update files in a local directory by extracting files in a manifest.
+
+ WARNING. NOT WELL TESTED. POTENTIALLY DANGEROUS.
+ PROBABLY BROKEN ON WINDOWS. """
+
+ dirs = find_dirs(manifest.name_map, target_dir)
+
+ overwrite, remove, local_dirs, extant = \
+ read_local_dir(manifest, target_dir, dirs, ignore_regex)
+
+ remove_dirs = local_dirs - dirs
+ create = set(manifest.name_map.keys()) - extant
+ if dry_run:
+ return (create, overwrite, set(remove.keys()), remove_dirs)
+
+ # Remove files
+ for victim in remove.values():
+ if os.path.exists(victim):
+ validate_path(target_dir, victim)
+ os.remove(victim)
+
+ # Remove directories
+ for victim in remove_dirs:
+ if os.path.exists(victim):
+ # REDFLAG: I saw this fail silently once
+ validate_path(target_dir, victim)
+ shutil.rmtree(victim)
+ assert not os.path.exists(victim)
+
+ # Make directories that exist in manifest, but not locally.
+ for dir_name in dirs:
+ if not os.path.exists(dir_name):
+ validate_path(target_dir, dir_name)
+ os.makedirs(dir_name)
+
+ # Copy files out of the archive, onto the local file system.
+ for file_name in manifest.name_map:
+ validate_path(target_dir, os.path.join(target_dir, file_name))
+ archive.get_file(manifest.name_map[file_name][1],
+ os.path.join(target_dir, file_name))
+
+ return (create, overwrite, set(remove.keys()), remove_dirs)
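+
+# Illustrative sketch (hypothetical 'archive', 'manifest', target dir):
+# preview what manifest_to_dir() would touch by passing dry_run=True.
+#
+# create, overwrite, removed, removed_dirs = \
+# manifest_to_dir(archive, manifest, '/some/target', dry_run=True)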
+
+class RawDataTupleEntry(IManifestEntry):
+ """ IManifestEntry implementation for an in-memory
+ (name, raw_data) tuple. """
+ def __init__(self, tmps, raw_tuple):
+ IManifestEntry.__init__(self)
+ self.tmps = tmps
+ self.raw_tuple = raw_tuple
+ self.full_path = None
+
+ def get_name(self):
+ """ IManifestEntry implementation. """
+ return self.raw_tuple[0]
+
+ def make_file(self):
+ """ IManifestEntry implementation. """
+ assert self.full_path is None
+ self.full_path = self.tmps.make_temp_file()
+ out_file = open(self.full_path, 'wb')
+ try:
+ out_file.write(self.raw_tuple[1])
+ finally:
+ out_file.close()
+
+ return self.full_path
+
+ # Callers MUST call this, or temp files will leak.
+ def release(self):
+ """ IManifestEntry implementation. """
+ if not self.full_path is None:
+ self.tmps.remove_temp_file(self.full_path)
+ self.full_path = None
+
+ # REDFLAG: Does this really help garbage collection or just CC?
+ self.raw_tuple = None
+ self.tmps = None
+
+def entries_from_seq(tmps, sequence):
+ """ An iterator which yields IManifestEntry instances
+ (RawDataTupleEntry) from a sequence of (name, raw_data) tuples.
+
+ REQUIRES: sequence not modified while iterating.
+ """
+ for value in sequence:
+ yield RawDataTupleEntry(tmps, value)
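+
+# Illustrative sketch (mirrors test_single_update in test_archive.py):
+# update a manifest from in-memory (name, raw_data) tuples. 'tmps' is an
+# ITempFileManager implementation; 'archive' and 'manifest' are assumed
+# to exist.
+#
+# data = (('foo.txt', 'This is the foo file.\n'),)
+# manifest.update(archive, entries_from_seq(tmps, data))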
diff --git a/wormarc/hghelper.py b/wormarc/hghelper.py
new file mode 100644
--- /dev/null
+++ b/wormarc/hghelper.py
@@ -0,0 +1,38 @@
+""" Testing helper functions for using hg repos.
+
+ Copyright (C) 2009 Darrell Karbott
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ Author: djk@isFiaD04zgAgnrEC5XJt1i4IE7AkNPqhBG5bONi6Yks
+"""
+
+from mercurial import hg, commands, ui
+
+#{'rev': '0', 'no_decode': None, 'prefix': '', 'exclude': [],
+# 'include': [], 'type': ''}
+def export_hg_repo(src_dir, dest_dir, target_rev):
+ """ Export the files in the hg repo in src_dir to dest_dir. """
+ ui_ = ui.ui()
+ repo = hg.repository(ui_, src_dir)
+ commands.archive(ui_,
+ repo,
+ dest_dir,
+ rev=target_rev,
+ prefix='' # <- needs this to work.
+ )
+ return repo['tip'].rev()
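+
+# Illustrative sketch (hypothetical paths): export revision 0 of a
+# local hg repo and learn the tip revision in one call.
+#
+# tip = export_hg_repo('/path/to/repo', '/tmp/export0', 0)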
+
+
diff --git a/wormarc/linkmap.py b/wormarc/linkmap.py
new file mode 100644
--- /dev/null
+++ b/wormarc/linkmap.py
@@ -0,0 +1,202 @@
+""" A class to keep track of history links stored in a set of files.
+
+ Copyright (C) 2009 Darrell Karbott
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ Author: djk@isFiaD04zgAgnrEC5XJt1i4IE7AkNPqhBG5bONi6Yks
+"""
+
+from binaryrep import read_link, str_sha
+
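+# Link tuple layout (inferred from read_link() usage here and from
+# link_str() in test_archive.py; see binaryrep for the real format):
+# (sha1, age, parent_sha1, data_or_None, stream_offset, block_ordinal)
+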
+class LinkMap(dict):
+ """ A history link hash addressable index of the history links in
+ a set of block files. """
+ def __init__(self):
+ dict.__init__(self)
+ self.files = []
+
+ def read(self, file_list, keep_data=False):
+ """ Read the index from a collection of block files. """
+ counts = [0 for dummy in range(0, len(file_list))]
+ age = 0 # Hmmmm
+ for index, name in enumerate(file_list):
+ in_stream = open(name, 'rb')
+ raised = True
+ try:
+ latest_age, count = self.read_from_stream(in_stream,
+ index, keep_data)
+ age = max(age, latest_age)
+ counts[index] = count
+ raised = False
+ finally:
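+ # Keep streams open for lazy link reads unless data was kept in memory.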
+ if raised or keep_data:
+ in_stream.close()
+ else:
+ self.files.append(in_stream)
+ return age, tuple(counts)
+
+
+ def read_from_stream(self, in_stream, index, keep_data=False):
+ """ Read links from a stream. """
+ age = 0
+ count = 0
+ while True:
+ link = read_link(in_stream, keep_data, in_stream.tell(),
+ index)
+ if link is None:
+ break
+
+ age = max(age, link[1])
+ prev = list(self.get(link[0], []))
+ prev.append(tuple(link))
+ self[link[0]] = tuple(prev)
+ count += 1
+
+ return age, count
+
+ # SLOW, get rid of list copy?
+ # fixups is an old_index -> new_index map.
+ # Omission from fixups == delete.
+ def _update_block_ordinals(self, fixups):
+ """ INTERNAL: Implementation helper for update_blocks(). """
+ for sha_hash in self.keys():
+ prev = self.get(sha_hash)
+ updated = []
+ for link in prev:
+ assert link[0] == sha_hash
+ if link[5] not in fixups:
+ continue # Dropped block
+ link = list(link)
+ link[5] = fixups[link[5]]
+ updated.append(tuple(link))
+ if len(updated) > 0:
+ self[sha_hash] = tuple(updated)
+ else:
+ del self[sha_hash]
+
+ # Fixes ordinals in referenced links.
+ # Drops omitted blocks.
+ # Closes and re-opens all file streams.
+ # Loads links from the streams in new_indices.
+ def update_blocks(self, fixups, file_list, new_indices, keep_data=False):
+ """ Update the index to track addition, deletion and reordering of
+ the underlying block files. """
+
+ assert len(self.files) == 0 # must be closed.
+ self._update_block_ordinals(fixups)
+ self.files = []
+ age = 0
+ raised = True
+ try:
+ for index, name in enumerate(file_list):
+ self.files.append(open(name, 'rb'))
+ if index not in new_indices:
+ continue
+
+ # Need to read links out of the new file.
+ latest_age, dummy = self.read_from_stream(self.files[index],
+ index, keep_data)
+ age = max(age, latest_age)
+ raised = False
+ return age
+ finally:
+ if raised:
+ self.close()
+
+ def close(self):
+ """ Close the index. """
+ for in_file in self.files:
+ in_file.close()
+ self.files = []
+
+ def get_link(self, link_sha, need_data=False):
+ """ Get a history link by its sha1 hash. """
+ links = self.get(link_sha, None)
+ if links is None:
+ raise IOError("Unresolved link: " + str_sha(link_sha))
+
+ assert len(links) > 0
+ # REDFLAG: Fully think through.
+ # The same link can exist in multiple files.
+ link = links[0]
+
+ if (not need_data) or (link[3] is not None):
+ return link
+
+ index = link[5]
+ self.files[index].seek(link[4])
+ ret = read_link(self.files[index], True)
+ if ret is None:
+ raise IOError("Couldn't read blob from disk.")
+
+ assert ret[0] == link[0]
+ assert ret[1] == link[1]
+ assert ret[2] == link[2]
+ assert not ret[3] is None
+ assert ret[0] == link_sha
+ return ret
+
+def raw_block_read(link_map, ordinal):
+ """ Read a single block file. """
+ table = {}
+ in_stream = link_map.files[ordinal]
+ in_stream.seek(0)
+ while True:
+ start_pos = in_stream.tell()
+ link = read_link(in_stream, False, start_pos, ordinal)
+ # read_link() never returns None except for eof, right?
+ # Otherwise we'd only do a partial read...
+ if link is None:
+ break
+ entry = table.get(link[0], [])
+ entry.append(link)
+ table[link[0]] = entry
+ return table
+
+def links_by_block(link_map):
+ """ INTERNAL: Implementation helper function for
+ verify_link_map(). """
+ tables = [{} for dummy in range(0, len(link_map.files))]
+ for links in link_map.values():
+ assert len(links) > 0
+ for link in links:
+ ordinal = link[5]
+ assert 0 <= ordinal < len(link_map.files)
+ entry = tables[ordinal].get(link[0], [])
+ entry.append(link)
+ tables[ordinal][link[0]] = entry
+ return tables
+
+def verify_link_map(link_map):
+ """ Debugging function to verify the integrity of a LinkMap instance. """
+
+ assert len(link_map.files) > 0
+ count = 0
+ by_block = links_by_block(link_map)
+
+ for ordinal in range(0, len(link_map.files)):
+ raw_shas = raw_block_read(link_map, ordinal)
+ # Hashes read from the raw file are the same as
+ # the ones that the LinkMap thinks should be in the file.
+ assert frozenset(raw_shas.keys()) == frozenset(by_block[ordinal].keys())
+
+ # Now check values.
+ for link_sha in raw_shas:
+ assert (frozenset(raw_shas[link_sha]) ==
+ frozenset(by_block[ordinal][link_sha]))
+ count += 1
+ return count
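+
+# Illustrative sketch: run the integrity check against a loaded
+# archive's index, as the unit tests do (assuming an 'archive' instance):
+#
+# count = verify_link_map(archive.blocks.link_map)
+# print "verified %i link entries" % count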
diff --git a/wormarc/shafunc.py b/wormarc/shafunc.py
new file mode 100644
--- /dev/null
+++ b/wormarc/shafunc.py
@@ -0,0 +1,50 @@
+""" Deal with move of SHA1 hash lib from sha to hashlib module.
+ Copyright (C) 2009 Darrell Karbott
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ Author: djk@isFiaD04zgAgnrEC5XJt1i4IE7AkNPqhBG5bONi6Yks
+"""
+
+try:
+ #raise ImportError("fake") # Tested under 2.6 using this.
+ from hashlib import sha1 as newshafunc
+ #print "LOADED NEW"
+ def new_sha(value=None):
+ """ Make new SHA1 instance using hashlib module. """
+ if value is None:
+ return newshafunc()
+ return newshafunc(value)
+
+except ImportError:
+ # Fall back so that code still runs on pre 2.6 systems.
+ import sha as oldshamod
+ #print "LOADED OLD"
+ def new_sha(value=None):
+ """ Make new SHA1 instance using old sha module. """
+ if value is None:
+ return oldshamod.new()
+ return oldshamod.new(value)
+
+if __name__ == '__main__':
+ # Smoke test: both construction styles must produce the same digest.
+ TEXT = 'OH HAI'
+ SHA_A = new_sha()
+ SHA_A.update(TEXT)
+ SHA_B = new_sha(TEXT)
+ print SHA_A.hexdigest()
+ print SHA_B.hexdigest()
+ assert SHA_A.hexdigest() == SHA_B.hexdigest()
diff --git a/wormarc/test_archive.py b/wormarc/test_archive.py
new file mode 100644
--- /dev/null
+++ b/wormarc/test_archive.py
@@ -0,0 +1,730 @@
+""" Unit tests.
+
+ Copyright (C) 2009 Darrell Karbott
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2.0 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+ Author: djk@isFiaD04zgAgnrEC5XJt1i4IE7AkNPqhBG5bONi6Yks
+"""
+
+
+# OK to be a little sloppy for test code.
+# pylint: disable-msg=C0111
+# For setUp() and tearDown()
+# pylint: disable-msg=C0103
+# Allow attribute creation in setUp()
+# pylint: disable-msg=W0201
+# Allow test methods that don't reference self.
+# pylint: disable-msg=R0201
+# Allow many test methods.
+# pylint: disable-msg=R0904
+import os
+import shutil
+import traceback
+import random
+import time
+import sys
+import unittest
+
+from shafunc import new_sha as sha1
+
+from binaryrep import NULL_SHA, get_file_sha, str_sha
+from blocks import BlockStorage, ITempFileManager
+from linkmap import verify_link_map
+from filemanifest import FileManifest, entries_from_dir, entries_from_seq, \
+ manifest_to_dir, verify_manifest, validate_path
+
+from archive import WORMBlockArchive, is_ordered, is_contiguous, \
+ repartition, compress
+
+from deltacoder import DeltaCoder
+
+from hghelper import export_hg_repo
+
+# False causes test dir to be cleaned up automatically
+# after every run.
+LEAVE_TEST_DIR = False
+
+# Absolute path to some hg repository to use for
+# testing.
+# You MUST MODIFY this for test_hg_repo_torture_test() to work
+HG_REPO_DIR = ""
+# e.g.:
+#HG_REPO_DIR = os.path.expanduser("~/mess/hg_zoo/somedude")
+
+#----------------------------------------------------------#
+TEST_BASE = '/tmp/'
+TEST_ROOT = '__latest_test_run__'
+
+TMP_DIR = '__TMP__'
+TEST_DIR = 'test'
+
+
+class HandleTemps(ITempFileManager):
+ """ Delegate to handle temp file creation and deletion. """
+ def __init__(self, base_dir):
+ ITempFileManager.__init__(self)
+ self.base_dir = base_dir
+ self.callers = {}
+ def make_temp_file(self):
+ """ Return a new unique temp file name including full path. """
+ name = os.path.join(self.base_dir, "__TMP__%s" %
+ str(random.random())[2:])
+ self.callers[name] = traceback.extract_stack()
+ return name
+
+ def remove_temp_file(self, full_path):
+ """ Remove and existing temp file. """
+ if not os.path.split(full_path)[-1].startswith("__TMP__"):
+ raise IOError("Didn't create: %s" % full_path)
+
+ if not os.path.exists(full_path):
+ return
+
+ if full_path in self.callers:
+ del self.callers[full_path]
+ else:
+ print "HandleTemps.remove_file() -- removing non-managed file???"
+ print full_path
+
+ os.remove(full_path)
+
+ def check_for_leaks(self):
+ for name in self.callers:
+ if not os.path.exists(name):
+ continue
+
+ print "LEAKED: ", name
+ print "FROM:"
+ print self.callers[name]
+
+ if len(os.listdir(self.base_dir)) > 0:
+ file_count = 0
+ for name in os.listdir(self.base_dir):
+ if os.path.isdir(os.path.join(self.base_dir, name)):
+ # Allow directories. e.g. __hg_repo__, __unarchived__.
+ print "HandleTemps.check_for_leaks -- ignored dir: ", name
+ continue
+ print name
+ file_count += 1
+
+ if file_count > 0:
+ raise IOError("Undeleted temp files!")
+
+def dump_blocks(blocks, msg=None, brief=False):
+ if not msg is None:
+ print msg
+ values = []
+ for index in range(0, len(blocks.tags)):
+ path = blocks.full_path(index)
+ if os.path.exists(path):
+ length = str(os.path.getsize(path))
+ else:
+ length = "no_file"
+ if brief:
+ values.append(length)
+ else:
+ values.append("%s:[%s]" % (path, length))
+
+ if brief:
+ print "blocks: " + " ".join(values)
+ else:
+ print "blocks\n" + "\n".join(values)
+
+def link_str(link):
+ return "(%s, %i, %s, data: %s, %i, %s)" % (str_sha(link[0]),
+ link[1],
+ str_sha(link[2]),
+ bool(link[3]),
+ link[4],
+ link[5])
+def dump_links(links, msg=None):
+ if not msg is None:
+ print msg
+ for link in links:
+ print link_str(link)
+
+def dump_link_map(link_map, msg=None, brief=False):
+ if not msg is None:
+ print msg
+ print "keys: ", len(link_map)
+ if brief:
+ return
+ keys = link_map.keys()
+ keys.sort()
+ for key in keys:
+ print str_sha(key)
+ dump_links(link_map[key])
+
+def dump_names_map(names_map, msg=None):
+ if not msg is None:
+ print msg
+ keys = names_map.keys()
+ keys.sort()
+ for key in keys:
+ hashes = names_map[key]
+ print "%s->(%s, %s)" % (key, str_sha(hashes[0]), str_sha(hashes[1]))
+
+def dump_archive(archive, msg=None, brief=False):
+ print "--- start archive dump ---"
+ if not msg is None:
+ print msg
+ print "age: %i max_blocks: %i" % (archive.age, archive.max_blocks)
+ dump_blocks(archive.blocks, "blocks:")
+
+ dump_link_map(archive.blocks.link_map, "link_map:", brief)
+ print "--- end ---"
+
+
+def words():
+ while True:
+ yield sha1(str(random.random())).hexdigest()[:random.randrange(1, 9)]
+
+WORD_ITR = words()
+
+def lines(count):
+ line = ""
+ while count > 0:
+ line += WORD_ITR.next()
+ line += " "
+ if len(line) > 60:
+ ret = line
+ line = ""
+ count -= 1
+ yield ret.strip()
+ return
+
+class ArchiveTestCase(unittest.TestCase):
+ def setup_test_dirs(self, base_dir, dir_name):
+ if not os.path.exists(base_dir):
+ raise IOError("Base test directory doesn't exist: %s" % base_dir)
+
+ full_path = os.path.join(base_dir, dir_name)
+ if os.path.exists(full_path):
+ raise IOError("Test directory exists: %s" % full_path)
+
+ os.makedirs(full_path)
+ self.test_root = full_path
+ self.test_dir = os.path.join(self.test_root, TEST_DIR)
+ self.tmp_dir = os.path.join(self.test_root, TMP_DIR)
+ os.makedirs(self.test_dir)
+ os.makedirs(self.tmp_dir)
+
+ def remove_test_dirs(self):
+ assert self.test_root.endswith(TEST_ROOT)
+ try:
+ self.tmps.check_for_leaks()
+ finally:
+ if not LEAVE_TEST_DIR:
+ shutil.rmtree(self.test_root)
+
+ # Caller must release temp file.
+ def write_file(self, raw):
+ file_name = self.tmps.make_temp_file()
+ out_file = open(file_name, 'wb')
+ raised = True
+ try:
+ out_file.write(raw)
+ raised = False
+ finally:
+ out_file.close()
+ if raised:
+ self.tmps.remove_temp_file(file_name)
+
+ return file_name
+
+ def read_file(self, file_name, remove_tmp=True):
+ in_file = open(file_name, 'rb')
+ try:
+ ret = in_file.read()
+ finally:
+ in_file.close()
+ if remove_tmp:
+ self.tmps.remove_temp_file(file_name)
+ return ret
+
+
+ def setUp(self):
+ self.setup_test_dirs(TEST_BASE, TEST_ROOT)
+ self.tmps = HandleTemps(self.tmp_dir)
+
+ def tearDown(self):
+ self.remove_test_dirs()
+
+class SmokeTests(ArchiveTestCase):
+ def _testLeakATempFile(self):
+ out_file = open(self.tmps.make_temp_file(), 'wb')
+ out_file.write("OH NOES! FILZ IZ LIIKAN!!!")
+ out_file.close()
+
+ def make_empty_archive(self, block_name):
+ archive = WORMBlockArchive(DeltaCoder(), BlockStorage(self.tmps))
+
+ archive.create(self.test_dir, block_name)
+
+ return archive
+
+ def load_archive(self, block_name):
+ archive = WORMBlockArchive(DeltaCoder(), BlockStorage(self.tmps))
+ archive.load(self.test_dir, block_name)
+
+ return archive
+
+ def test_create_archive(self):
+ print
+ archive = self.make_empty_archive('A')
+ dump_archive(archive)
+
+ def test_load_archive(self):
+ print
+ self.make_empty_archive('A')
+ b = self.load_archive('A')
+ dump_archive(b)
+
+ def test_archive_write_read(self):
+ a = self.make_empty_archive('A')
+ dump_archive(a, "empty")
+
+ r0 = self.write_file("OH HAI!")
+ r1 = self.write_file("OH HAI! AGAIN")
+ r2 = self.write_file("STILL ME")
+
+ t1 = self.tmps.make_temp_file()
+ try:
+ a.start_update()
+ link0 = a.write_new_delta(NULL_SHA, r0)
+ link1 = a.write_new_delta(NULL_SHA, r1)
+ link2 = a.write_new_delta(NULL_SHA, r2)
+
+ # Write
+ a.commit_update()
+ dump_archive(a, "updated")
+
+ # Read
+ print
+ print str_sha(link0[0]), a.get_data(link0[0])
+ print str_sha(link1[0]), a.get_data(link1[0])
+ print str_sha(link2[0]), a.get_data(link2[0])
+
+ a.close()
+
+ b = self.load_archive('A')
+ dump_archive(b, "[Reloaded from disk]")
+ print
+ # Mix up order.
+ print str_sha(link1[0]), b.get_data(link1[0])
+ print str_sha(link0[0]), b.get_data(link0[0])
+ print str_sha(link2[0]), b.get_data(link2[0])
+ finally:
+ self.tmps.remove_temp_file(t1)
+ self.tmps.remove_temp_file(r0)
+ self.tmps.remove_temp_file(r1)
+ self.tmps.remove_temp_file(r2)
+ #a.abandon_update()
+
+ def test_torture_a_single_chain(self):
+ a = self.make_empty_archive('A')
+ dump_archive(a, "empty")
+
+ text = ""
+ prev = NULL_SHA
+ for iteration in range(0, 5000):
+ # Write
+ a.start_update()
+ text += str(time.time()) + '\n'
+ t2 = self.write_file(text)
+ #print "Adding to: ", str_sha(prev)
+
+ link = a.write_new_delta(prev, t2)
+ new_sha = link[0]
+ link = None
+ #print "Added: ", str_sha(new_sha), str_sha(new_parent)
+ a.commit_update()
+ self.tmps.remove_temp_file(t2)
+
+ #history = a.blocks.get_history(new_sha)
+ #history_size = sum([value[6] for value in history])
+ #print "History: ", len(history), history_size, len(text)
+ #print
+ #dump_archive(a, "updated", True)
+
+ t3 = self.tmps.make_temp_file()
+ a.get_file(new_sha, t3)
+
+ self.assertTrue(text == self.read_file(t3))
+
+ prev = new_sha
+ if iteration > 0 and iteration % 100 == 0:
+ print "iteration: ", iteration
+
+ # grrr... giving up on temp files
+ def test_single_update(self):
+ a = self.make_empty_archive('A')
+ m = FileManifest()
+ data = ( \
+ ('foo.txt', 'This is the foo file.\n'),
+ ('empty.txt', ''),
+ ('big.txt', '*' * (1024 * 128)),
+ )
+ entries = entries_from_seq(self.tmps, data)
+ m.update(a, entries)
+ dump_archive(a)
+
+ def test_multiple_updates(self):
+ a = self.make_empty_archive('A')
+ m = FileManifest()
+ data0 = ( \
+ ('foo.txt', 'This is the foo file.\n'),
+ ('empty.txt', ''),
+ ('big.txt', '*' * (1 * 128)),
+ )
+
+ print "manifest sha: ", str_sha(m.stored_sha)
+ m.update(a, entries_from_seq(self.tmps, data0))
+ print "manifest sha: ", str_sha(m.stored_sha)
+
+ dump_archive(a, "AFTER FIRST WRITE:")
+ verify_manifest(a, m)
+
+ data1 = ( \
+ ('foo.txt', 'This is the foo file.\n'),
+ ('empty.txt', ''),
+ ('big.txt', 'hello' + ('*' * (1 * 128))),
+ )
+
+ m.update(a, entries_from_seq(self.tmps, data1))
+ print "manifest sha: ", str_sha(m.stored_sha)
+ dump_archive(a)
+ verify_link_map(a.blocks.link_map)
+ verify_manifest(a, m)
+
+ def test_words(self):
+ print WORD_ITR.next()
+
+ def test_lines(self):
+ for line in lines(10):
+ print line
+
+ def test_many_updates(self):
+
+ a = self.make_empty_archive('A')
+ m = FileManifest()
+
+ files = ("A.txt", "B.txt", "C.txt")
+
+ updates = 100
+ for dummy in range(0, updates):
+ names = list(files)
+ random.shuffle(names)
+ #names = names[:random.randrange(1, len(files))]
+ data = []
+ for name in names:
+ text = ''
+ if name in m.name_map:
+ tmp = self.tmps.make_temp_file()
+ a.get_file(m.name_map[name][1], tmp)
+ text = self.read_file(tmp)
+ text += "\n".join([line for line in lines(20)])
+
+ data.append((name, text))
+
+ #print "updating:"
+ #for value in data:
+ # print value[0], len(value[1])
+
+ #print "manifest sha: ", str_sha(m.stored_sha)
+ #dump_archive(a, "BEFORE UPDATE: %i" % count, True)
+ m.update(a, entries_from_seq(self.tmps, data))
+ #print "manifest sha: ", str_sha(m.stored_sha)
+
+ #dump_archive(a, "AFTER UPDATE: %i" % count, True)
+ verify_manifest(a, m, True)
+ verify_link_map(a.blocks.link_map)
+ dump_blocks(a.blocks, None, True)
+
+ a.close()
+
+
+ def test_validate_path(self):
+ base_dir = "/tmp/test/foo"
+ validate_path(base_dir, "/tmp/test/foo/bar")
+ validate_path(base_dir, "/tmp/test/foo/baz")
+ validate_path(base_dir, "/tmp/test/foo/barf/text.dat")
+
+ try:
+ validate_path(base_dir, "/tmp/test/foo/../../../etc/passwd")
+ self.assertTrue(False)
+ except IOError, e:
+ print "Got expected exception: ", e
+
+ try:
+ validate_path(base_dir, "/tmp/test/foo/../forbidden")
+ self.assertTrue(False)
+ except IOError, e:
+ print "Got expected exception: ", e
+
+ try:
+ validate_path(base_dir,
+ u"/tmp/test/foo/f\xc3\xb6rbjuden.txt")
+ self.assertTrue(False)
+ except IOError, e:
+ print "Got expected exception: ", e
+
+ try:
+ validate_path(base_dir,
+ "/tmp/test/foo/f\xc3\xb6rbjuden.txt")
+ self.assertTrue(False)
+ except IOError, e:
+ print "Got expected exception: ", e
+
+ def test_is_contiguous(self):
+ self.assertTrue(is_contiguous( () ))
+ self.assertTrue(is_contiguous( ((0, 0, '?'), ) ))
+ self.assertTrue(is_contiguous( ((0, 0, 2), (1, 1, '?')) ))
+ self.assertTrue(is_contiguous( ((0, 1, 2), (2, 3, '?')) ))
+ self.assertFalse(is_contiguous( ((0, 0, 2), (2, 2, '?')) ))
+ self.assertFalse(is_contiguous( ((0, 1, 2), (3, 3, '?')) ))
+
+ # Trailing Zeros are ignored.
+ def test_is_ordered(self):
+ self.assertTrue(is_ordered( () ))
+ self.assertTrue(is_ordered( (('?', '?', 2),) ))
+ self.assertTrue(is_ordered( (('?', '?', 2), ('?', '?', 2)) ))
+ self.assertFalse(is_ordered( (('?', '?', 2), ('?', '?', 1)) ))
+ self.assertTrue(is_ordered( (('?', '?', 1), ('?', '?', 2)) ))
+ self.assertTrue(is_ordered( (('?', '?', 2), ('?', '?', 2),
+ ('?', '?', 2)) ))
+ self.assertTrue(is_ordered( (('?', '?', 1), ('?', '?', 2),
+ ('?', '?', 2)) ))
+ self.assertFalse(is_ordered( (('?', '?', 1), ('?', '?', 0),
+ ('?', '?', 2)) ))
+ self.assertTrue(is_ordered( (('?', '?', 1), ('?', '?', 2),
+ ('?', '?', 3)) ))
+
+ self.assertTrue(is_ordered( (('?', '?', 2), ('?', '?', 0)) ))
+ self.assertTrue(is_ordered( (('?', '?', 2), ('?', '?', 2),
+ ('?', '?', 0)) ))
+ self.assertFalse(is_ordered( (('?', '?', 2), ('?', '?', 1),
+ ('?', '?', 0)) ))
+ self.assertTrue(is_ordered( (('?', '?', 1), ('?', '?', 2),
+ ('?', '?', 0), ('?', '?', 0)) ))
+
+
+ self.assertTrue(is_ordered( (('?', '?', 2), ('?', '?', 2),
+ ('?', '?', 2),
+ ('?', '?', 0)) ))
+
+
+ self.assertTrue(is_ordered( (('?', '?', 2), ('?', '?', 2),
+ ('?', '?', 2),
+ ('?', '?', 0)) ))
+ self.assertTrue(is_ordered( (('?', '?', 1), ('?', '?', 2),
+ ('?', '?', 2),
+ ('?', '?', 0)) ))
+ self.assertFalse(is_ordered( (('?', '?', 1), ('?', '?', 0),
+ ('?', '?', 2),
+ ('?', '?', 0)) ))
+ self.assertTrue(is_ordered( (('?', '?', 1), ('?', '?', 2),
+ ('?', '?', 3),
+ ('?', '?', 0)) ))
+
+ self.assertFalse(is_ordered( (('?', '?', 3), ('?', '?', 2),
+ ('?', '?', 1),
+ ('?', '?', 0)) ))
+
+ self.assertFalse(is_ordered( (('?', '?', 3), ('?', '?', 2),
+ ('?', '?', 1) )) )
+
+
+ def test_repartition(self):
+ for dummy in range(0, 1000):
+ length = random.randrange(1, 8)
+ blocks = [(index, index, random.randrange(1, 10))
+ for index in range(0, length)]
+ self.assertTrue(is_contiguous(blocks))
+ original_blocks = blocks[:]
+ #were_ordered = is_ordered(blocks)
+ #print blocks
+ repartitioned = repartition(blocks)
+ #print repartitioned
+ self.assertTrue(is_ordered(repartitioned))
+ self.assertTrue(blocks == original_blocks)
+
+ # Can't assert this anymore.
+ # Trips when in-order partitions get merged because they
+ # don't meet the multiple constraint.
+ # #self.assertTrue((were_ordered and blocks == repartitioned) or
+ # ((not were_ordered) and blocks != repartitioned))
+
+ self.assertTrue(is_contiguous(repartitioned))
+
+
+ def updateFunc(self, blocks, change_len, max_len):
+ assert len(blocks) > 0
+ blocks = blocks[:]
+ if blocks[0][2] + change_len < 32 * 1024:
+ blocks[0] = (blocks[0][0], blocks[0][1], blocks[0][2] + change_len)
+ return blocks
+ # Add and compress
+ blocks.insert(0, (-1, -1, change_len))
+ return compress(blocks, max_len)
+
+ def histogram(self, values, bin_width):
+ table = {}
+ for value in values:
+ index = int(value/bin_width)
+ table[index] = table.get(index, 0) + 1
+
+ max_bin = max(table.keys())
+ return tuple([(index, table.get(index, 0))
+ for index in range(0, max_bin + 1)])
+
+
+ # Naive
+ # DOESN'T SIMULATE:
+ # o Dropping unreferenced chains.
+ # o GOOD: reduces total archive size
+ # o BAD: effective length of older blocks declines with time
+ # as unreferenced chains drop out. -> churn ???
+ # o variance in commit sizes
+
+ # HACKed this together fast, not sure it is correct.
+ # Looks like I'm getting a power law dist.
+ def test_simulate_updates(self):
+ max_blocks = 4
+ iterations = 10000
+ change_size = 2*1024
+ blocks = [(index, index, 0) for index in range(0, max_blocks)]
+ changes = []
+ for dummy in range(0, iterations):
+ old_blocks = blocks[:]
+ blocks = self.updateFunc(blocks, change_size, max_blocks)
+
+ if not ((is_ordered(blocks) or
+ (is_ordered(blocks[1:]) and blocks[0][2] < 32 * 1024))):
+ print blocks
+
+ self.assertTrue(is_ordered(blocks) or
+ (is_ordered(blocks[1:]) and
+ blocks[0][2] < 32 * 1024))
+
+ changed = set(old_blocks) - set(blocks)
+ for value in changed:
+ # i.e. the number of bytes we had to write
+ changes.append(value[2])
+
+ # Fix ordinals. Shouldn't matter.
+ blocks = [(index, index, blocks[index][2]) for index
+ in range(0, len(blocks))]
+
+ #hist = self.histogram(changes, 32 * 1024)
+ #for value in hist:
+ # print value[0], value[1]
+
+ changes.sort()
+ #max_insert = max(changes)
+ for percent in (50, 75, 80, 85, 90, 95, 99, 100):
+ point = changes[min(int((percent/100.0) * len(changes)),
+ len(changes) - 1)]
+ print "%i %i %i" % (percent, point, point/(32*1024 + 1))
+
+
+ def test_hg_repo_torture_test(self):
+ if HG_REPO_DIR == '':
+ print "Set HG_REPO_DIR!"
+ self.assertTrue(False)
+
+ writer = self.make_empty_archive('hgtst')
+ manifest = FileManifest()
+
+ rev = 0
+ max_rev = 1 # Set below
+ while rev < max_rev:
+ target_dir = os.path.join(self.tmp_dir, '__hg_repo__')
+ if os.path.exists(target_dir):
+ shutil.rmtree(target_dir) # DANGEROUS
+
+ # export the repo
+ # FIX: Wacky way to set max_rev.
+ print "Exporting rev: ", rev
+ max_rev = export_hg_repo(HG_REPO_DIR, target_dir, rev)
+ if rev >= max_rev:
+ break
+
+ # put the export dir into the archive
+ # print "Inserting into the archive..."
+
+ entries = entries_from_dir(target_dir, True)
+ manifest.update(writer, entries)
+
+ # Will be written into Freenet top key
+ # along with rest of archive info.
+ s3kr1t = manifest.stored_sha
+
+ dump_blocks(writer.blocks, None, True)
+ # create a second archive instance from the same block files.
+ # REDFLAG: Would this work on windoze?
+ # writer still has files open for reading.
+ reader = self.load_archive('hgtst')
+ read_manifest = FileManifest.from_archive(reader, s3kr1t)
+ # REDFLAG: audit for other places where I could do
+ # direct dict compares?
+ assert (read_manifest.name_map == manifest.name_map)
+
+ # clean the archive output dir
+ unarchived_dir = os.path.join(self.tmp_dir, '__unarchived__')
+ if os.path.exists(unarchived_dir):
+ shutil.rmtree(unarchived_dir) # DANGEROUS
+
+ os.makedirs(unarchived_dir)
+
+ # extract the archive to the cleaned files
+ manifest_to_dir(reader, read_manifest, unarchived_dir)
+ reader.close()
+
+ # diff the directories
+
+ # A poor man's diff.
+ insert_map = {}
+ for entry in entries_from_dir(target_dir, True):
+ insert_map[entry.get_name()] = get_file_sha(entry.make_file())
+ entry.release() # NOP
+
+ unarchived_map = {}
+ for entry in entries_from_dir(unarchived_dir, True):
+ unarchived_map[entry.get_name()] = (
+ get_file_sha(entry.make_file()))
+ entry.release() # NOP
+
+
+ assert len(insert_map) > 0
+ assert insert_map == unarchived_map
+ print "%i files compared equal." % len(insert_map)
+
+ rev += 1
+
+
+if __name__ == '__main__':
+ # use -v on command line to get verbose output.
+ # verbosity keyword arg not supported in 2.6?
+ if len(sys.argv) >= 2 and sys.argv[1] != '-v':
+ # Run a single test case
+ suite = unittest.TestSuite()
+ suite.addTest(SmokeTests(sys.argv[1]))
+ unittest.TextTestRunner().run(suite)
+ else:
+ # Run everything.
+ unittest.main()