> On 1 Mar 2017, at 04:11, Nick Mathewson <nickm@xxxxxxxxxxxx> wrote: > > On Tue, Feb 28, 2017 at 6:42 AM, teor <teor2345@xxxxxxxxx> wrote: > > Hi, Tim! > > This looks pretty plausible to me. Do you have time to write up a > quick&dirty python function that actually performs the smoothing? Well, it's a function, but it's not pretty, because it needs to know all the consensus bandwidth values to find the percentile to do the smoothing. > If > so, I can test it out on the January consensuses and measure the > impact on compressed diff sizes. To generate a list of bandwidths, run: cat cached-microdesc-consensus | grep ^w | cut -d= -f2 | cut -d" " -f1 > bandwidth_list To generate a list of pairs: bandwidth rounded_bandwidth Run: ./bwround.py bandwidth_list There's an alternative scheme available in the script, to use it, comment out the line: rounding_configs = excess_rounding (The alternative scheme rounds the 99th percentile to 2 significant figures rather than 3.) I'd be interested in how each scheme performs. T -- Tim Wilson-Brown (teor) teor2345 at gmail dot com PGP C855 6CED 5D90 A0C5 29F6 4D43 450C BA7F 968F 094B ricochet:ekmygaiu4rzgsk6n xmpp: teor at torproject dot org ------------------------------------------------------------------------
#!/usr/bin/python
import math
import bisect
import sys
# define error functions and midpoint functions
def error_ratio(mid, minmax):
'''The ratio between the rounded value and the actual value
| n / r(n) - 1 |
'''
return abs(minmax / mid - 1.0)
def mid_min_error_ratio(l, h):
'''Minimise the ratio between the rounded value and the actual value
'''
# if low is 0, map everything to 0
if l == 0.0:
return h
return math.sqrt(l * h)
def error_difference(mid, minmax):
'''The difference between the rounded value and the actual value
| n - r(n) |
'''
return abs(minmax - mid)
def mid_min_difference(l, h):
'''Minimise difference between the rounded value and the actual value
'''
return (l + h)/2.0
# define rounding table generation functions
def rounding_table(precision, geometric = True,
low = None, high = None, values = None):
'''Create a table of geometric rounding values
The number of distinct values is the same as that used to round to
precision significant digits, or 9*10**(precision - 1)
'''
# we effectively round to a number of values corresponding to 'precision'
# significant digits, but we need an extra digit of precision to accurately
# capture the values around the low end
precision += 1.0
if low is None:
low = math.pow(10.0, precision - 1.0)
else:
low = float(low)
if high is None:
high = low*10.0
else:
high = float(high)
# Create the same number of values as there are in the existing scheme
# (an appropriate midpoint function should be used to choose whether to
# round to the lower or higher value)
if values is None:
values = int((high - low) / 10.0)
else:
values = int(values)
assert values > 0
bounds = [low, high]
scale = (high-low)/values
if values == 1:
# use the values supplied: high is the lower bound of the next table
bounds = [int(low), int(high)]
elif not geometric:
# alternative implementation: increase the values linerly between low
# and high
bounds = range(int(low), int(high)+1, scale)
else:
# increase the values geometrically between low and high
step = math.pow(high/low, 1.0/values)
#print step
next = low/scale
bounds = []
while next <= (high/scale + 1.0):
bounds.append(next)
next *= step
# round each bound to the nearest integer
bounds = [int(round(b*scale, 0)) for b in bounds]
# turns out we need this to calculate the midpoint
# chop off the upper bound: it belongs to the next decile
#if bounds[-1] == high:
# del bounds[-1]
return bounds
# define analysis functions
midpoint = mid_min_error_ratio
tests = [error_ratio, error_difference]
def analyse_bounds_variance(bounds):
'''Analyse the variance when rounding to the values in the bounds table
'''
print len(bounds)
b_prev = None
for b in bounds:
comparison = "{:.0f}".format(b)
if b_prev is not None:
for test in tests:
comparison += " {:.5f}".format(test(midpoint(b_prev, b), b))
print comparison
b_prev = b
print bounds
def calculate_rounding_tables(configs, geometric = True):
'''Calculate the set of rounding values for each config in configs
'''
for config in configs:
table = rounding_table(config['precision'],
geometric,
config.get('min_value'),
config.get('max_value'),
config.get('values'),
)
config['table'] = table
#print config
#analyse_bounds_variance(table)
def get_rounding_config(configs, value, percentile):
'''Return the relevant rounding config for value, which is at percentile
'''
# first, see if value corresponds to a config without a percentile
for config in configs:
if config.get('max_percentile') is None:
if value >= config['min_value'] and value < config['max_value']:
return config
# now check the configs with a percentile
for config in configs:
if (config.get('max_percentile') is not None and
config.get('min_percentile') is not None):
if (percentile >= config['min_percentile'] and
percentile < config['max_percentile']):
return config
# the table should be comprehensive
assert False
def get_percentile_for_value(bandwidths, value):
'''Given a list of bandwidths, find the percentile for value.
'''
bandwidths.sort()
# if there are several instances of value, choose the highest percentile
idx = bisect.bisect_right(bandwidths,value)
return (100.0 * idx) / len(bandwidths)
def get_closest_entries(table, value):
'''Return the pair of table entries that are closest to value
'''
low = table[0]
for high in table:
if value >= low and value <= high:
return (low, high)
low = high
assert False
def get_scale(table, value):
'''Get the ratio required to scale value within table
'''
# find the bounds that we have to scale within
tmin = table[0]
tmax = table[-1]
if tmin > 0.0:
factor = tmax / tmin
else:
factor = tmax
scale = 1.0
while True:
# prefer the lower end of the table
if value / scale >= tmax:
scale *= factor
elif value / scale < tmin:
scale /= factor
else:
return scale
def round_using_table(table, value, geometric = True):
'''Round value according to table
'''
# choose the midpoint function
if geometric:
mp = mid_min_error_ratio
else:
mp = mid_min_difference
scale = get_scale(table, value)
#print scale
(low, high) = get_closest_entries(table, value/scale)
mid = mp(low, high)
# if it's equal to the midpoint, choose the higher value
if value/scale >= mid:
return int(high*scale)
else:
return int(low*scale)
def round_value(bandwidths, rounding_configs, value):
'''Round value based on its position in bandwidths and the relevent config
'''
percentile = get_percentile_for_value(bandwidths, value)
config = get_rounding_config(rounding_configs, value, percentile)
rounded = round_using_table(config['table'], value, geometric)
#print config
#print value, percentile, rounded
return rounded
# define concrete rounding configs
# a table is used from its max_percentile to the next max_percentile of
# relay bandwidths, or for configs without a percentile, from [min_value, max_value)
# this rounding config has excess precision for the 99th percentile, because
# the input values are already rounded to 3 significant figures
excess_rounding = [
# use 100.1 to include the 100th percentile
{ 'max_percentile': 100.1, 'min_percentile': 99.0, 'precision': 3 },
{ 'max_percentile': 99.0, 'min_percentile': 90.0, 'precision': 2 },
{ 'max_percentile': 90.0, 'min_percentile': 0.0, 'precision': 1 },
# max_percentile: None is used for the tail end bandwidths 1 - 100
{ 'max_percentile': None, 'min_percentile': None, 'precision': 0,
'max_value': 100.0, 'min_value': 1.0, 'values': 3 },
# anything below the tail rounds to itself (that is, 0 rounds to 0)
{ 'max_percentile': None, 'min_percentile': None, 'precision': 0,
'max_value': 1.0, 'min_value': 0.0, 'values': 1 },
]
# this rounding config should be highly compressible, but discards some
# information from the 99th percentile
basic_rounding = [
# use 100.1 to include the 100th percentile
{ 'max_percentile': 100.1, 'min_percentile': 99.0, 'precision': 2 },
{ 'max_percentile': 99.0, 'min_percentile': 90.0, 'precision': 1 },
{ 'max_percentile': 90.0, 'min_percentile': 0.0, 'precision': 0,
'values': 3 },
# max_percentile: None is used for the tail end bandwidths 1 - 100
{ 'max_percentile': None, 'min_percentile': None, 'precision': 0,
'max_value': 100.0, 'min_value': 1.0, 'values': 3 },
# anything below the tail rounds to itself (that is, 0 rounds to 0)
{ 'max_percentile': None, 'min_percentile': None, 'precision': 0,
'max_value': 1.0, 'min_value': 0.0, 'values': 1 },
]
geometric = True
rounding_configs = basic_rounding
rounding_configs = excess_rounding
calculate_rounding_tables(rounding_configs, geometric)
# create the bandwidths file using:
# cat cached-microdesc-consensus | grep ^w | cut -d= -f2 | cut -d" " -f1
bandwidths = []
with open(sys.argv[1], 'r') as bandwidths_file:
for line in bandwidths_file:
line.strip()
bandwidths.append(int(line))
# round values supplied on the command line
if len(sys.argv) > 2:
# round values supplied on the command line
for value in sys.argv[2:]:
value = int(value)
rounded = round_value(bandwidths, rounding_configs, value)
print value, rounded
else:
# round the entire file
for value in bandwidths:
value = int(value)
rounded = round_value(bandwidths, rounding_configs, value)
print value, rounded
Attachment:
signature.asc
Description: Message signed with OpenPGP
_______________________________________________ tor-dev mailing list tor-dev@xxxxxxxxxxxxxxxxxxxx https://lists.torproject.org/cgi-bin/mailman/listinfo/tor-dev