> On 1 Mar 2017, at 04:11, Nick Mathewson <nickm@xxxxxxxxxxxx> wrote: > > On Tue, Feb 28, 2017 at 6:42 AM, teor <teor2345@xxxxxxxxx> wrote: > > Hi, Tim! > > This looks pretty plausible to me. Do you have time to write up a > quick&dirty python function that actually performs the smoothing? Well, it's a function, but it's not pretty, because it needs to know all the consensus bandwidth values to find the percentile to do the smoothing. > If > so, I can test it out on the January consensuses and measure the > impact on compressed diff sizes. To generate a list of bandwidths, run: cat cached-microdesc-consensus | grep ^w | cut -d= -f2 | cut -d" " -f1 > bandwidth_list To generate a list of pairs: bandwidth rounded_bandwidth Run: ./bwround.py bandwidth_list There's an alternative scheme available in the script, to use it, comment out the line: rounding_configs = excess_rounding (The alternative scheme rounds the 99th percentile to 2 significant figures rather than 3.) I'd be interested in how each scheme performs. T -- Tim Wilson-Brown (teor) teor2345 at gmail dot com PGP C855 6CED 5D90 A0C5 29F6 4D43 450C BA7F 968F 094B ricochet:ekmygaiu4rzgsk6n xmpp: teor at torproject dot org ------------------------------------------------------------------------
#!/usr/bin/python import math import bisect import sys # define error functions and midpoint functions def error_ratio(mid, minmax): '''The ratio between the rounded value and the actual value | n / r(n) - 1 | ''' return abs(minmax / mid - 1.0) def mid_min_error_ratio(l, h): '''Minimise the ratio between the rounded value and the actual value ''' # if low is 0, map everything to 0 if l == 0.0: return h return math.sqrt(l * h) def error_difference(mid, minmax): '''The difference between the rounded value and the actual value | n - r(n) | ''' return abs(minmax - mid) def mid_min_difference(l, h): '''Minimise difference between the rounded value and the actual value ''' return (l + h)/2.0 # define rounding table generation functions def rounding_table(precision, geometric = True, low = None, high = None, values = None): '''Create a table of geometric rounding values The number of distinct values is the same as that used to round to precision significant digits, or 9*10**(precision - 1) ''' # we effectively round to a number of values corresponding to 'precision' # significant digits, but we need an extra digit of precision to accurately # capture the values around the low end precision += 1.0 if low is None: low = math.pow(10.0, precision - 1.0) else: low = float(low) if high is None: high = low*10.0 else: high = float(high) # Create the same number of values as there are in the existing scheme # (an appropriate midpoint function should be used to choose whether to # round to the lower or higher value) if values is None: values = int((high - low) / 10.0) else: values = int(values) assert values > 0 bounds = [low, high] scale = (high-low)/values if values == 1: # use the values supplied: high is the lower bound of the next table bounds = [int(low), int(high)] elif not geometric: # alternative implementation: increase the values linerly between low # and high bounds = range(int(low), int(high)+1, scale) else: # increase the values geometrically between low and high step = math.pow(high/low, 1.0/values) #print step next = low/scale bounds = [] while next <= (high/scale + 1.0): bounds.append(next) next *= step # round each bound to the nearest integer bounds = [int(round(b*scale, 0)) for b in bounds] # turns out we need this to calculate the midpoint # chop off the upper bound: it belongs to the next decile #if bounds[-1] == high: # del bounds[-1] return bounds # define analysis functions midpoint = mid_min_error_ratio tests = [error_ratio, error_difference] def analyse_bounds_variance(bounds): '''Analyse the variance when rounding to the values in the bounds table ''' print len(bounds) b_prev = None for b in bounds: comparison = "{:.0f}".format(b) if b_prev is not None: for test in tests: comparison += " {:.5f}".format(test(midpoint(b_prev, b), b)) print comparison b_prev = b print bounds def calculate_rounding_tables(configs, geometric = True): '''Calculate the set of rounding values for each config in configs ''' for config in configs: table = rounding_table(config['precision'], geometric, config.get('min_value'), config.get('max_value'), config.get('values'), ) config['table'] = table #print config #analyse_bounds_variance(table) def get_rounding_config(configs, value, percentile): '''Return the relevant rounding config for value, which is at percentile ''' # first, see if value corresponds to a config without a percentile for config in configs: if config.get('max_percentile') is None: if value >= config['min_value'] and value < config['max_value']: return config # now check the configs with a percentile for config in configs: if (config.get('max_percentile') is not None and config.get('min_percentile') is not None): if (percentile >= config['min_percentile'] and percentile < config['max_percentile']): return config # the table should be comprehensive assert False def get_percentile_for_value(bandwidths, value): '''Given a list of bandwidths, find the percentile for value. ''' bandwidths.sort() # if there are several instances of value, choose the highest percentile idx = bisect.bisect_right(bandwidths,value) return (100.0 * idx) / len(bandwidths) def get_closest_entries(table, value): '''Return the pair of table entries that are closest to value ''' low = table[0] for high in table: if value >= low and value <= high: return (low, high) low = high assert False def get_scale(table, value): '''Get the ratio required to scale value within table ''' # find the bounds that we have to scale within tmin = table[0] tmax = table[-1] if tmin > 0.0: factor = tmax / tmin else: factor = tmax scale = 1.0 while True: # prefer the lower end of the table if value / scale >= tmax: scale *= factor elif value / scale < tmin: scale /= factor else: return scale def round_using_table(table, value, geometric = True): '''Round value according to table ''' # choose the midpoint function if geometric: mp = mid_min_error_ratio else: mp = mid_min_difference scale = get_scale(table, value) #print scale (low, high) = get_closest_entries(table, value/scale) mid = mp(low, high) # if it's equal to the midpoint, choose the higher value if value/scale >= mid: return int(high*scale) else: return int(low*scale) def round_value(bandwidths, rounding_configs, value): '''Round value based on its position in bandwidths and the relevent config ''' percentile = get_percentile_for_value(bandwidths, value) config = get_rounding_config(rounding_configs, value, percentile) rounded = round_using_table(config['table'], value, geometric) #print config #print value, percentile, rounded return rounded # define concrete rounding configs # a table is used from its max_percentile to the next max_percentile of # relay bandwidths, or for configs without a percentile, from [min_value, max_value) # this rounding config has excess precision for the 99th percentile, because # the input values are already rounded to 3 significant figures excess_rounding = [ # use 100.1 to include the 100th percentile { 'max_percentile': 100.1, 'min_percentile': 99.0, 'precision': 3 }, { 'max_percentile': 99.0, 'min_percentile': 90.0, 'precision': 2 }, { 'max_percentile': 90.0, 'min_percentile': 0.0, 'precision': 1 }, # max_percentile: None is used for the tail end bandwidths 1 - 100 { 'max_percentile': None, 'min_percentile': None, 'precision': 0, 'max_value': 100.0, 'min_value': 1.0, 'values': 3 }, # anything below the tail rounds to itself (that is, 0 rounds to 0) { 'max_percentile': None, 'min_percentile': None, 'precision': 0, 'max_value': 1.0, 'min_value': 0.0, 'values': 1 }, ] # this rounding config should be highly compressible, but discards some # information from the 99th percentile basic_rounding = [ # use 100.1 to include the 100th percentile { 'max_percentile': 100.1, 'min_percentile': 99.0, 'precision': 2 }, { 'max_percentile': 99.0, 'min_percentile': 90.0, 'precision': 1 }, { 'max_percentile': 90.0, 'min_percentile': 0.0, 'precision': 0, 'values': 3 }, # max_percentile: None is used for the tail end bandwidths 1 - 100 { 'max_percentile': None, 'min_percentile': None, 'precision': 0, 'max_value': 100.0, 'min_value': 1.0, 'values': 3 }, # anything below the tail rounds to itself (that is, 0 rounds to 0) { 'max_percentile': None, 'min_percentile': None, 'precision': 0, 'max_value': 1.0, 'min_value': 0.0, 'values': 1 }, ] geometric = True rounding_configs = basic_rounding rounding_configs = excess_rounding calculate_rounding_tables(rounding_configs, geometric) # create the bandwidths file using: # cat cached-microdesc-consensus | grep ^w | cut -d= -f2 | cut -d" " -f1 bandwidths = [] with open(sys.argv[1], 'r') as bandwidths_file: for line in bandwidths_file: line.strip() bandwidths.append(int(line)) # round values supplied on the command line if len(sys.argv) > 2: # round values supplied on the command line for value in sys.argv[2:]: value = int(value) rounded = round_value(bandwidths, rounding_configs, value) print value, rounded else: # round the entire file for value in bandwidths: value = int(value) rounded = round_value(bandwidths, rounding_configs, value) print value, rounded
Attachment:
signature.asc
Description: Message signed with OpenPGP
_______________________________________________ tor-dev mailing list tor-dev@xxxxxxxxxxxxxxxxxxxx https://lists.torproject.org/cgi-bin/mailman/listinfo/tor-dev