infocalypse: infocalypse/choose.py

""" Functions to choose which bundle to fetch next.

    Copyright (C) 2009 Darrell Karbott

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public
    License as published by the Free Software Foundation; either
    version 2.0 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    General Public License for more details.

    You should have received a copy of the GNU General Public
    License along with this library; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

    Author: djk@isFiaD04zgAgnrEC5XJt1i4IE7AkNPqhBG5bONi6Yks
"""

import random

from graph import MAX_PATH_LEN, block_cost, print_list

# This is the maximum allowed ratio of allowed path block cost
# to minimum full update block cost.
# It is used in low_block_cost_edges() to determine when a
# path is too expensive to include.
MAX_COST_RATIO = 2.0

def step_contains(step, index):
    """ Returns True if step contains index. """
    return index > step[0] and index <= step[1]

# REDFLAG: dog slow for long lists (worse than O(n^2))
def shuffle_edge_redundancy(graph, first, second, known):
    """ INTERNAL: Shuffle the redundancy for redundant edges in
        the values returned by get_update_edges.  """
    # Kick it Pascal style.
    def shuffle_one(graph, current, other, known):
        """ INTERNAL: shuffle redundancy for a single edge list.  """
        for index, edge in enumerate(current):
            if not graph.is_redundant(edge):
                continue # Not redundant

            new_ordinal = random.choice((0, 1))
            if new_ordinal == edge[2]:
                # NOP is a valid choice.
                continue

            alternate_edge = (edge[0], edge[1], new_ordinal)
            if alternate_edge in known: # fast
                # Already queued
                continue

            try:
                pos = other.index(alternate_edge) # slow
            except ValueError:
                pos = -1

            if pos != -1:
                # If already in other,  swap
                #print "shuffle_one -- swapped with other list %s -> %s" % \
                #      (str(current[index]), str(other[pos]))
                tmp = other[pos]
                other[pos] = current[index]
                current[index] = tmp

                continue
            try:
                pos = current.index(alternate_edge) # slow
            except ValueError:
                pos = -1

            if pos != -1:
                # If already in current
                #print "shuffle_one -- swapped in same list %s -> %s" % \
                #      (str(current[index]), str(current[pos]))
                current[pos] = current[index]
            #else:
            #    print "shuffle_one -- flipped  %s -> %s" % \
            #          (str(current[index]), str(alternate_edge))

            current[index] = alternate_edge

    assert len(set(first).intersection(known)) == 0
    assert len(set(second).intersection(known)) == 0

    # #ifdef _DEBUG_?  only used to check invariants.
    first_len = len(first)
    second_len = len(second)

    shuffle_one(graph, first, second, known)
    shuffle_one(graph, second, first, known)

    assert len(set(first).intersection(known)) == 0
    assert len(set(second).intersection(known)) == 0
    assert len(first) == first_len
    assert len(second) == second_len


# Returns the number of edges which contain the index.
def contained_edge_count(edges, index, max_count=None):
    """ INTERNAL: Helper function returns the number of edges
        which contain index. """
    count = 0
    for step in edges:
        assert not step is None
        if step_contains(step, index):
            count += 1
            if not max_count is None and count >= max_count:
                return count
    return count

# ORDER: Best candidates first.
# INTENT:
# Inserter does 2 33K updates. The last one causes a "rollup"
# of the entire multi-megabyte repo into one CHK.
# This code is intended to make sure that we try fetching the
# first 33k update  before the rollup CHK even though it means
# fetching more keys.
def low_block_cost_edges(graph, known_edges, from_index, allowed):
    """ INTERNAL:  Returns the best update edges that aren't too big. """
    # MAX_PATH_LEN - 1.  If it takes more steps you should be using
    # a canonical path.
    paths = graph.enumerate_update_paths(from_index + 1,
                                         graph.latest_index, MAX_PATH_LEN - 1)
    if len(paths) == 0:
        return

    first = []

    with_cost = []
    for path in paths:
        total = 0
        for step in path:
            total += block_cost(graph.insert_length(step))
        with_cost.append((total, path))
    with_cost.sort()
    #for item in with_cost:
    #    print "COST: ", item[0], item

    # Ignore paths which are too much bigger than the shortest path.
    allowed_cost = int(with_cost[0][0] * MAX_COST_RATIO)
    #print "get_update_edges -- min block cost: ", \
    #   with_cost[0][0], allowed_cost
    # First steps of the paths with a cost <= allowed_cost
    first_steps = [[value[1][0], ] for value in with_cost if value[0]
                   <= allowed_cost]
    #print "FIRST_STEPS: ", first_steps
    first_steps.sort(graph.cmp_recency)
    for path in first_steps:
        assert len(path) == 1
        step = path[0]
        if step in known_edges:
            continue
        first.append(step)
        known_edges.add(step)
        allowed -= 1
        if allowed <= 0:
            break
    return first

# ORDER: Most canonical first.
# Best candidates at the end of the list
def canonical_path_edges(graph, known_edges, from_index, allowed):
    """ INTERNAL: Returns edges containing from_index from canonical paths. """
    # Steps from canonical paths
    paths = graph.canonical_paths(graph.latest_index, MAX_PATH_LEN)
    second = []
    #print "get_update_edges -- after"
    for path in paths:
        # We need the tmp gook because the edges can overlap
        # and we want the most recent ones.
        tmp = []
        for step in path:
            assert not step is None
            if (step_contains(step, from_index + 1) and
                not step in known_edges):
                tmp.append([step, ])

        tmp.sort(graph.cmp_recency)
        for tmp_path in tmp:
            assert len(tmp_path) == 1
            assert not tmp_path[0] is None
            second.append(tmp_path[0])
            known_edges.add(tmp_path[0])
            allowed -= 1
            if allowed <= 0:
                return second

    return second


# STEP BACK:
# This function answers two questions:
# 0) What should I request to update as quickly as possible?
# A: Steps from paths <= MAX_COST_RATIO * (minimal block cost)
#    if there are any.
# 1) What should I request if that doesn't work.
# A: Most canonical.
#    Then backfill by recency.

# Will I always be able to fully enumerate search paths? or too slow

# REDFLAG: g'ter done code. Simplify and re-examine efficiency.
# REDFLAG: rename redundancy. redundancy == 2 -> 2 paths, NOT 3

# Returns (first_choice_steps, second_choice_steps)
def get_update_edges(graph, from_index, redundancy, shuffle_redundancy=False,
                     known_edges=None):
    """ Gets edges not already in known edges which could be used to
        update (pull). """

    if known_edges is None:
        known_edges = set([])

    assert not None in known_edges

    allowed = redundancy - contained_edge_count(known_edges, from_index + 1,
                                                redundancy)
    if allowed <= 0:
        # Bail out if we already have enough edges.
        return ([], [])

    original_known = known_edges
    known_edges = known_edges.copy()

    # 0) First get some low block cost paths.
    # Hmmm... make allowed cheap edges a parameter
    first = low_block_cost_edges(graph, known_edges, from_index,
                                 min(2, allowed))

    allowed -= len(first)
    second = []
    if allowed > 0:
        # 1) Then get edges from canonical path
        second = canonical_path_edges(graph, known_edges,
                                      from_index, allowed)

        # Resort by recency.
        second_paths = [[edge, ] for edge in second]
        second_paths.sort(graph.cmp_recency)
        second = [path[0] for path in second_paths]

        allowed -= len(second)

    if allowed > 0:
        # 2) Finally backfill with most recent other edges which
        # advance us at least one step.
        containing_paths = [[edge, ] for edge in
                            graph.contain(from_index + 1)
                            if edge not in known_edges]

        containing_paths.sort(graph.cmp_recency)

        for path in containing_paths:
            second.insert(0, path[0])
            known_edges.add(path[0])
            allowed -= 1
            if allowed <= 0:
                break

    # Hmmmm... previously I was always sorting second by recency.
    if shuffle_redundancy:
        shuffle_edge_redundancy(graph, first, second, original_known)

    #print "get_update_edges -- exiting", len(first), len(second)
    return (first, list(second))

def dump_update_edges(first, second, all_edges):
    """ Debugging function to print update edges. """
    print "--- update edges --- "
    print_list("known edges  :", all_edges)
    print_list("first choice :", first)
    print_list("second choice:", second)
    print "---"