[tor-commits] [metrics-tasks/master] Add simple parser for bridge descriptors in python to work on ticket 2866
commit 18509bdf00646fda601581795c6f7d7800976a50
Author: Tomás Touceda <chiiph@xxxxxxxxxxxxxx>
Date: Thu May 10 19:41:09 2012 -0300
Add simple parser for bridge descriptors in python to work on ticket 2866
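
The script walks a directory tree looking for extracted bridge pool assignment
files (path components starting with "bridge-pool-assignments") and sanitized
bridge extra-info descriptors (path components starting with
"bridge-descriptors-" plus "extra-info"). It takes that directory as its
single, optional argument, for example (the path below is only a placeholder):

    python parser.py /path/to/extracted/metrics-archives

Without an argument it looks in the current directory.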
---
task-2866/parser.py | 313 +++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 313 insertions(+), 0 deletions(-)
diff --git a/task-2866/parser.py b/task-2866/parser.py
new file mode 100644
index 0000000..260120b
--- /dev/null
+++ b/task-2866/parser.py
@@ -0,0 +1,313 @@
+import os
+import datetime
+import sys
+
+from math import floor
+
+UNALLOC_POOL = "unallocated"
+
+class Bridge(object):
+    def __init__(self, fp, pool = "", params = {}):
+        object.__init__(self)
+        self._fp = fp
+        self._pool = pool
+        self._params = params
+        self._from = None
+        self._to = None
+        self._usage_per_country = {}
+        self._usage_per_day = {}
+
+    def pprint(self):
+        print self._fp, self._pool, self._params
+
+    # <40 bytes fp> <pool> <key>=<val> ...
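+    # e.g. (illustrative values only, not taken from a real assignment file):
+    #   0123456789abcdef0123456789abcdef01234567 https ring=2 port=443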
+    @staticmethod
+    def fromLine(line):
+        fp = line[:40]
+        pool = ""
+        params = {}
+
+        rest = line[41:]
+        next_spc = rest.find(" ")
+        if next_spc == -1:
+            pool = rest
+            return Bridge(fp, pool)
+
+        pool = rest[:next_spc]
+        rest = rest[next_spc+1:]
+
+        # every remaining whitespace-separated token is a key=value pair
+        # (including the last one, which has no trailing space)
+        for kv in rest.split():
+            key, val = Bridge.parseKeyVal(kv)
+            params[key] = val
+
+        return Bridge(fp, pool, params)
+
+    @staticmethod
+    def parseKeyVal(line):
+        # split on the first "=" only, in case a value itself contains one
+        key, val = line.split("=", 1)
+        return key, val
+
+class BridgePool(object):
+    def __init__(self, path, bridges, year, month, day, hour, minute, seconds):
+        object.__init__(self)
+        self._path = path
+        self._date = (year, month, day, hour, minute, seconds)
+        self._bridges = bridges
+        self.gatherBridges()
+
+    def pprint(self):
+        print "%s/%s/%s :: %s:%s:%s" % self._date
+        for b in self._bridges.values():
+            b.pprint()
+
+    def gatherBridges(self):
+        f = open(self._path, "r")
+        # skip the first line of the file (a header line, not an assignment)
+        contents = f.read().split("\n")[1:]
+        f.close()
+        for line in contents:
+            if len(line.strip()) == 0:
+                continue
+            if line[:40] in self._bridges.keys():
+                self._bridges[line[:40]]._to = self._date
+            else:
+                b = Bridge.fromLine(line)
+                b._from = self._date
+                self._bridges[b._fp] = b
+
+def listHas(l, comp):
+    for e in l:
+        if e.startswith(comp):
+            return True
+    return False
+
+class BridgeUsageAnalyzer(object):
+    def __init__(self, fromPath = ".", bridges = {}):
+        object.__init__(self)
+        self._bridges = bridges
+        self._root = fromPath
+        self.gatherUsage()
+
+    def gatherUsage(self):
+        print "Walking dirs to gather bridge usage (you might want to go do something else for a while)..."
+        for root, dirs, files in os.walk(self._root):
+            comps = root.split(os.sep)
+            # bridge-descriptors-<year>-<month>/fp[0]/fp[1]/fp
+            if not listHas(comps, "bridge-descriptors-"):
+                continue
+            if not listHas(comps, "extra-info"):
+                continue
+            print root
+
+            for f in files:
+                f_value = os.path.join(root, f)
+                try:
+                    bfp, ips, day = self.parseExtraInfo(f_value)
+                    if not bfp in self._bridges.keys():
+                        #print "WARNING: %s is not in the bridge pool" % (f,)
+                        continue
+                    daily_usage = 0
+                    for c in ips.keys():
+                        if not c in self._bridges[bfp]._usage_per_country.keys():
+                            self._bridges[bfp]._usage_per_country[c] = 0
+                        self._bridges[bfp]._usage_per_country[c] += ips[c]
+                        daily_usage += ips[c]
+                    if not day in self._bridges[bfp]._usage_per_day.keys():
+                        self._bridges[bfp]._usage_per_day[day] = 0
+                    self._bridges[bfp]._usage_per_day[day] += daily_usage
+                except KeyboardInterrupt, e:
+                    sys.exit(1)
+                except Exception, e:
+                    print e
+                    print "ERROR: Something went wrong while processing %s" % (f_value,)
+
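+    # parseExtraInfo() only looks at three kinds of lines in a descriptor;
+    # they are assumed to look roughly like this (illustrative values):
+    #   extra-info Unnamed 0123456789ABCDEF0123456789ABCDEF01234567
+    #   published 2012-05-10 19:07:04
+    #   bridge-ips us=24,de=16,ir=8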
+    def parseExtraInfo(self, path):
+        contents = open(path, "r").read()
+        bfp = ""
+        ips = {}
+        day = ""
+        for l in contents.split("\n"):
+            if l.startswith("extra-info Unnamed"):
+                bfp = l.split(" ")[-1].lower()
+                continue
+            if l.startswith("published"):
+                day = l.split(" ")[1]
+            if l.startswith("bridge-ips "):
+                # drop the "bridge-ips" keyword but keep the first country
+                # entry, which a plain "," split alone would lose
+                bridgeips = l.split(" ", 1)[1].split(",")
+                try:
+                    for ci in bridgeips:
+                        parts = ci.split("=")
+                        ips[parts[0]] = int(parts[1])
+                except:
+                    break
+
+        return bfp, ips, day
+
+    # Question: is there any correlation between the usage by country
+    # and the pool the bridge is from?
+    def analyzeUsage(self):
+        alloc_users = 0
+        unalloc_users = 0
+
+        alloc_usage = {}
+        unalloc_usage = {}
+        for bfp in self._bridges.keys():
+            acc = 0
+            for c in self._bridges[bfp]._usage_per_country.keys():
+                acc = self._bridges[bfp]._usage_per_country[c]
+                if self._bridges[bfp]._pool == UNALLOC_POOL:
+                    unalloc_users += acc
+                    if not c in unalloc_usage.keys():
+                        unalloc_usage[c] = 0
+                    unalloc_usage[c] += acc
+                else:
+                    alloc_users += acc
+                    if not c in alloc_usage.keys():
+                        alloc_usage[c] = 0
+                    alloc_usage[c] += acc
+
+        print "Total allocated pool users: %d" % (alloc_users,)
+        print "Total unallocated pool users: %d" % (unalloc_users,)
+
+        alloc_items = alloc_usage.items()
+        alloc_sorted_items = sorted(alloc_items, key=lambda use: use[1])
+        print "Country usage for allocated pool:"
+        for country, count in alloc_sorted_items:
+            print " %s = %d" % (country, count)
+
+        unalloc_items = unalloc_usage.items()
+        unalloc_sorted_items = sorted(unalloc_items, key=lambda use: use[1])
+        print "Country usage for unallocated pool:"
+        for country, count in unalloc_sorted_items:
+            print " %s = %d" % (country, count)
+
+        total_avg_unalloc = []
+        total_avg_alloc = []
+        for bfp in self._bridges.keys():
+            avg = 0.0
+            for day in self._bridges[bfp]._usage_per_day.keys():
+                avg += self._bridges[bfp]._usage_per_day[day]
+            if avg == 0.0:
+                continue
+            avg = float(avg)/len(self._bridges[bfp]._usage_per_day.keys())
+            if self._bridges[bfp]._pool == UNALLOC_POOL:
+                total_avg_unalloc.append(avg)
+            else:
+                total_avg_alloc.append(avg)
+            #print "Average for %s: %d" % (bfp, avg)
+        total_avg_unalloc.sort()
+        total_avg_alloc.sort()
+
+        print "Unallocated pool:"
+        print " Average clients per day: %d" % (float(sum(total_avg_unalloc))/len(total_avg_unalloc),)
+        print " Median: %d" % (total_avg_unalloc[int(round(len(total_avg_unalloc)/2.0))],)
+        print " 75th percentile: %d" % (total_avg_unalloc[int(round(len(total_avg_unalloc)*0.75))],)
+        print " 90th percentile: %d" % (total_avg_unalloc[int(round(len(total_avg_unalloc)*0.9))],)
+
+        print "Allocated pool:"
+        print " Average clients per day: %d" % (float(sum(total_avg_alloc))/len(total_avg_alloc),)
+        print " Median: %d" % (total_avg_alloc[int(round(len(total_avg_alloc)/2.0))],)
+        print " 75th percentile: %d" % (total_avg_alloc[int(round(len(total_avg_alloc)*0.75))],)
+        print " 90th percentile: %d" % (total_avg_alloc[int(round(len(total_avg_alloc)*0.9))],)
+
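+# Note: pool assignment snapshots are assumed to live one per file, with the
+# snapshot time encoded in the file name as
+# <year>-<month>-<day>-<hour>-<minute>-<second>; the class attributes below
+# are indices into that "-"-split file name.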
+class BridgePoolAnalyzer(object):
+    year = 0
+    month = 1
+    day = 2
+    hour = 3
+    minute = 4
+    seconds = 5
+
+    def __init__(self, fromPath = "."):
+        object.__init__(self)
+        self._root = fromPath
+        self._pools = []
+        self._bridges = {}
+        self.gatherBridgePools()
+
+    def pprint(self):
+        for bp in self._pools:
+            bp.pprint()
+
+    def gatherBridgePools(self):
+        print "Walking dirs to gather bridge pools (you might want to go do something else for a while)..."
+        for root, dirs, files in os.walk(self._root):
+            # walk directories and files in sorted (i.e. chronological) order
+            # so that _from/_to dates are assigned consistently
+            dirs.sort()
+            files.sort()
+            comps = root.split(os.sep)
+            if not listHas(comps, "bridge-pool-assignments"):
+                continue
+            print root
+
+            for f in files:
+                f_value = os.path.join(root, f)
+                date = f.split("-")
+                try:
+                    self._pools.append(BridgePool(f_value, self._bridges,
+                        date[self.year], date[self.month], date[self.day],
+                        date[self.hour], date[self.minute], date[self.seconds]))
+                except KeyboardInterrupt, e:
+                    sys.exit(1)
+                except Exception, e:
+                    print e
+                    print "ERROR: Something went wrong while processing %s" % (f_value,)
+
+    # Question: how long do bridges last when they are unassigned?
+    def bridgeAvailability(self):
+        print "Analyzing the data..."
+        # availability is measured in days between the first and last
+        # assignment snapshot that listed the bridge
+        availability_unalloc = {}
+        availability_else = {}
+
+        for bfp in self._bridges.keys():
+            b = self._bridges[bfp]
+            if b._to is None:
+                b._to = self._pools[-1]._date
+            if b._pool == UNALLOC_POOL:
+                availability_unalloc[b._fp] = (datetime.datetime(int(b._to[0]), int(b._to[1]), int(b._to[2]),
+                    int(b._to[3]), int(b._to[4]), int(b._to[5])) -
+                    datetime.datetime(int(b._from[0]), int(b._from[1]), int(b._from[2]),
+                    int(b._from[3]), int(b._from[4]), int(b._from[5]))).total_seconds() / (60.0 * 60.0 * 24.0)
+            else:
+                availability_else[b._fp] = (datetime.datetime(int(b._to[0]), int(b._to[1]), int(b._to[2]),
+                    int(b._to[3]), int(b._to[4]), int(b._to[5])) -
+                    datetime.datetime(int(b._from[0]), int(b._from[1]), int(b._from[2]),
+                    int(b._from[3]), int(b._from[4]), int(b._from[5]))).total_seconds() / (60.0 * 60.0 * 24.0)
+
+        print "Allocated bridges:"
+        doall(availability_else)
+
+        print "Unallocated bridges:"
+        doall(availability_unalloc)
+
+def calc_average(array):
+    avg = 0.0
+    for k in array.keys():
+        avg += float(array[k])
+    avg /= len(array.keys())
+    return avg
+
+def calc_median(values):
+    return values[int(floor(len(values)/2.0))]
+
+def calc_percentile(values, perc):
+    # clamp to the last valid index so small data sets don't go out of range
+    pos = min(len(values) - 1, int(round(perc/100.0 * float(len(values)))))
+    return values[pos]
+
+def doall(array):
+    print " Average availability:", calc_average(array)
+    values = array.values()
+    values.sort()
+    print " Median availability:", calc_median(values)
+    print " 75th percentile availability:", calc_percentile(values, 75)
+    print " 90th percentile availability:", calc_percentile(values, 90)
+
+if __name__ == "__main__":
+    path = "."
+    if len(sys.argv) > 1:
+        path = sys.argv[1]
+    bpa = BridgePoolAnalyzer(path)
+    #bpa.pprint()
+    bpa.bridgeAvailability()
+
+    bua = BridgeUsageAnalyzer(path, bpa._bridges)
+    bua.analyzeUsage()