[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]
[or-cvs] r14318: When we remove old routers, use Bloom filters rather than a (in tor/trunk: . src/common src/or)
Author: nickm
Date: 2008-04-08 13:06:41 -0400 (Tue, 08 Apr 2008)
New Revision: 14318
Modified:
tor/trunk/
tor/trunk/ChangeLog
tor/trunk/src/common/container.c
tor/trunk/src/common/container.h
tor/trunk/src/or/routerlist.c
tor/trunk/src/or/test.c
Log:
r19233@catbus: nickm | 2008-04-08 13:06:34 -0400
When we remove old routers, use Bloom filters rather than a digestmap-based set in order to tell which ones we absolutely need to keep. This will save us roughly a kazillion little short-lived allocations for hash table entries.
Property changes on: tor/trunk
___________________________________________________________________
svk:merge ticket from /tor/trunk [r19233] on 8246c3cf-6607-4228-993b-4d95d33730f1
Modified: tor/trunk/ChangeLog
===================================================================
--- tor/trunk/ChangeLog 2008-04-08 17:06:38 UTC (rev 14317)
+++ tor/trunk/ChangeLog 2008-04-08 17:06:41 UTC (rev 14318)
@@ -51,6 +51,10 @@
- New --hush command-line option similar to --quiet. While --quiet
disables all logging to the console on startup, --hush limits the
output to messages of warning and error severity.
+ - Use a Bloom filter rather than a digest-based set to track which
+ descriptors we need to keep around when we're cleaning out old
+ router descriptors. This speeds up the computation significantly, and
+ may reduce fragmentation.
o Code simplifications and refactoring:
- Refactor code using connection_ap_handshake_attach_circuit() to
Modified: tor/trunk/src/common/container.c
===================================================================
--- tor/trunk/src/common/container.c 2008-04-08 17:06:38 UTC (rev 14317)
+++ tor/trunk/src/common/container.c 2008-04-08 17:06:41 UTC (rev 14318)
@@ -1192,3 +1192,23 @@
IMPLEMENT_ORDER_FUNC(find_nth_uint32, uint32_t)
IMPLEMENT_ORDER_FUNC(find_nth_long, long)
+/** Return a newly allocated digestset_t, optimized to hold a total of
+ * <b>max_elements</b> digests with a reasonably low false positive weight. */
+digestset_t *
+digestset_new(int max_elements)
+{
+ int n_bits = 1u << (tor_log2(max_elements)+5);
+ digestset_t *r = tor_malloc(sizeof(digestset_t));
+ r->mask = n_bits - 1;
+ r->ba = bitarray_init_zero(n_bits);
+ return r;
+}
+
+/** Free all storage held in <b>set</b>. */
+void
+digestset_free(digestset_t *set)
+{
+ bitarray_free(set->ba);
+ tor_free(set);
+}
+
Modified: tor/trunk/src/common/container.h
===================================================================
--- tor/trunk/src/common/container.h 2008-04-08 17:06:38 UTC (rev 14317)
+++ tor/trunk/src/common/container.h 2008-04-08 17:06:41 UTC (rev 14318)
@@ -564,6 +564,49 @@
return b[bit >> BITARRAY_SHIFT] & (1u << (bit & BITARRAY_MASK));
}
+/** A set of digests, implemented as a Bloom filter. */
+typedef struct {
+ int mask; /* One less than the number of bits in <b>ba</b>; always one less
+ * than a power of two. */
+ bitarray_t *ba; /* A bit array to implement the Bloom filter. */
+} digestset_t;
+
+#define BIT(n) ((n) & set->mask)
+/** Add the digest <b>digest</b> to <b>set</b>. */
+static INLINE void
+digestset_add(digestset_t *set, const char *digest)
+{
+ const uint32_t *p = (const uint32_t *)digest;
+ const uint32_t d1 = p[0] + (p[1]>>16);
+ const uint32_t d2 = p[1] + (p[2]>>16);
+ const uint32_t d3 = p[2] + (p[3]>>16);
+ const uint32_t d4 = p[3] + (p[0]>>16);
+ bitarray_set(set->ba, BIT(d1));
+ bitarray_set(set->ba, BIT(d2));
+ bitarray_set(set->ba, BIT(d3));
+ bitarray_set(set->ba, BIT(d4));
+}
+
+/** If <b>digest</b> is in <b>set</b>, return nonzero. Otherwise,
+ * <em>probably</em> return zero. */
+static INLINE int
+digestset_isin(const digestset_t *set, const char *digest)
+{
+ const uint32_t *p = (const uint32_t *)digest;
+ const uint32_t d1 = p[0] + (p[1]>>16);
+ const uint32_t d2 = p[1] + (p[2]>>16);
+ const uint32_t d3 = p[2] + (p[3]>>16);
+ const uint32_t d4 = p[3] + (p[0]>>16);
+ return bitarray_is_set(set->ba, BIT(d1)) &&
+ bitarray_is_set(set->ba, BIT(d2)) &&
+ bitarray_is_set(set->ba, BIT(d3)) &&
+ bitarray_is_set(set->ba, BIT(d4));
+}
+#undef BIT
+
+digestset_t *digestset_new(int max_elements);
+void digestset_free(digestset_t* set);
+
/* These functions, given an <b>array</b> of <b>n_elements</b>, return the
* <b>nth</b> lowest element. <b>nth</b>=0 gives the lowest element;
* <b>n_elements</b>-1 gives the highest; and (<b>n_elements</b>-1) / 2 gives
Modified: tor/trunk/src/or/routerlist.c
===================================================================
--- tor/trunk/src/or/routerlist.c 2008-04-08 17:06:38 UTC (rev 14317)
+++ tor/trunk/src/or/routerlist.c 2008-04-08 17:06:41 UTC (rev 14318)
@@ -2957,7 +2957,7 @@
static void
routerlist_remove_old_cached_routers_with_id(time_t now,
time_t cutoff, int lo, int hi,
- digestmap_t *retain)
+ digestset_t *retain)
{
int i, n = hi-lo+1;
unsigned n_extra, n_rmv = 0;
@@ -2974,10 +2974,9 @@
tor_assert(!memcmp(ident, r->identity_digest, DIGEST_LEN));
}
#endif
-
/* Check whether we need to do anything at all. */
{
- int mdpr = directory_caches_dir_info(get_options()) ? 5 : 2;
+ int mdpr = directory_caches_dir_info(get_options()) ? 2 : 1;
if (n <= mdpr)
return;
n_extra = n - mdpr;
@@ -2993,7 +2992,7 @@
signed_descriptor_t *r_next;
lifespans[i-lo].idx = i;
if (r->last_listed_as_valid_until >= now ||
- (retain && digestmap_get(retain, r->signed_descriptor_digest))) {
+ (retain && digestset_isin(retain, r->signed_descriptor_digest))) {
must_keep[i-lo] = 1;
}
if (i < hi) {
@@ -3049,10 +3048,11 @@
time_t cutoff;
routerinfo_t *router;
signed_descriptor_t *sd;
- digestmap_t *retain;
+ digestset_t *retain;
int caches = directory_caches_dir_info(get_options());
const networkstatus_t *consensus = networkstatus_get_latest_consensus();
const smartlist_t *networkstatus_v2_list = networkstatus_get_v2_list();
+ int n_expected_retain = 0;
trusted_dirs_remove_old_certs();
@@ -3061,7 +3061,18 @@
// routerlist_assert_ok(routerlist);
- retain = digestmap_new();
+ n_expected_retain = smartlist_len(consensus->routerstatus_list);
+ if (caches &&
+ networkstatus_v2_list && smartlist_len(networkstatus_v2_list)) {
+ SMARTLIST_FOREACH(networkstatus_v2_list, networkstatus_v2_t *, ns,
+ n_expected_retain += smartlist_len(ns->entries));
+ /*XXXX021 too much magic. */
+ n_expected_retain /= (smartlist_len(networkstatus_v2_list)/2+1);
+ }
+ //log_notice(LD_DIR,"n_expected_retain=%d",n_expected_retain);
+
+ retain = digestset_new(n_expected_retain);
+
cutoff = now - OLD_ROUTER_DESC_MAX_AGE;
/* Build a list of all the descriptors that _anybody_ lists. */
if (caches) {
@@ -3077,7 +3088,7 @@
* system will obsolete this whole thing in 0.2.0.x. */
SMARTLIST_FOREACH(ns->entries, routerstatus_t *, rs,
if (rs->published_on >= cutoff)
- digestmap_set(retain, rs->descriptor_digest, (void*)1));
+ digestset_add(retain, rs->descriptor_digest));
});
}
@@ -3085,13 +3096,13 @@
if (consensus) {
SMARTLIST_FOREACH(consensus->routerstatus_list, routerstatus_t *, rs,
if (rs->published_on >= cutoff)
- digestmap_set(retain, rs->descriptor_digest, (void*)1));
+ digestset_add(retain, rs->descriptor_digest));
}
- /* If we have a bunch of networkstatuses, we should consider pruning current
- * routers that are too old and that nobody recommends. (If we don't have
- * enough networkstatuses, then we should get more before we decide to kill
- * routers.) */
+ /* If we have nearly as many networkstatuses as we want, we should consider
+ * pruning current routers that are too old and that nobody recommends. (If
+ * we don't have enough networkstatuses, then we should get more before we
+ * decide to kill routers.) */
if (!caches ||
smartlist_len(networkstatus_v2_list) > get_n_v2_authorities() / 2) {
cutoff = now - ROUTER_MAX_AGE;
@@ -3100,7 +3111,8 @@
router = smartlist_get(routerlist->routers, i);
if (router->cache_info.published_on <= cutoff &&
router->cache_info.last_listed_as_valid_until < now &&
- !digestmap_get(retain,router->cache_info.signed_descriptor_digest)) {
+ !digestset_isin(retain,
+ router->cache_info.signed_descriptor_digest)) {
/* Too old: remove it. (If we're a cache, just move it into
* old_routers.) */
log_info(LD_DIR,
@@ -3120,7 +3132,7 @@
sd = smartlist_get(routerlist->old_routers, i);
if (sd->published_on <= cutoff &&
sd->last_listed_as_valid_until < now &&
- !digestmap_get(retain, sd->signed_descriptor_digest)) {
+ !digestset_isin(retain, sd->signed_descriptor_digest)) {
/* Too old. Remove it. */
routerlist_remove_old(routerlist, sd, i--);
}
@@ -3128,11 +3140,9 @@
//routerlist_assert_ok(routerlist);
- log_info(LD_DIR, "We have %d live routers and %d old router descriptors. "
- "At most %d must be retained because of networkstatuses.",
+ log_info(LD_DIR, "We have %d live routers and %d old router descriptors.",
smartlist_len(routerlist->routers),
- smartlist_len(routerlist->old_routers),
- digestmap_size(retain));
+ smartlist_len(routerlist->old_routers));
/* Now we might have to look at routerlist->old_routers for extraneous
* members. (We'd keep all the members if we could, but we need to save
@@ -3141,9 +3151,10 @@
* total number doesn't approach max_descriptors_per_router()*len(router).
*/
if (smartlist_len(routerlist->old_routers) <
- smartlist_len(routerlist->routers) * (caches?4:2))
+ smartlist_len(routerlist->routers))
goto done;
+ /* Sort by identity, then fix indices. */
smartlist_sort(routerlist->old_routers, _compare_old_routers_by_identity);
/* Fix indices. */
for (i = 0; i < smartlist_len(routerlist->old_routers); ++i) {
@@ -3171,7 +3182,7 @@
//routerlist_assert_ok(routerlist);
done:
- digestmap_free(retain, NULL);
+ digestset_free(retain);
}
/** We just added a new set of descriptors. Take whatever extra steps
Modified: tor/trunk/src/or/test.c
===================================================================
--- tor/trunk/src/or/test.c 2008-04-08 17:06:38 UTC (rev 14317)
+++ tor/trunk/src/or/test.c 2008-04-08 17:06:41 UTC (rev 14318)
@@ -1922,6 +1922,40 @@
bitarray_free(ba);
}
+static void
+test_util_digestset(void)
+{
+ smartlist_t *included = smartlist_create();
+ char d[DIGEST_LEN];
+ int i;
+ int ok = 1;
+ int false_positives = 0;
+ digestset_t *set;
+
+ for (i = 0; i < 1000; ++i) {
+ crypto_rand(d, DIGEST_LEN);
+ smartlist_add(included, tor_memdup(d, DIGEST_LEN));
+ }
+ set = digestset_new(1000);
+ SMARTLIST_FOREACH(included, const char *, cp,
+ if (digestset_isin(set, cp))
+ ok = 0);
+ test_assert(ok);
+ SMARTLIST_FOREACH(included, const char *, cp,
+ digestset_add(set, cp));
+ SMARTLIST_FOREACH(included, const char *, cp,
+ if (!digestset_isin(set, cp))
+ ok = 0);
+ test_assert(ok);
+ for (i = 0; i < 1000; ++i) {
+ crypto_rand(d, DIGEST_LEN);
+ if (digestset_isin(set, d))
+ ++false_positives;
+ }
+ test_assert(false_positives < 50); /* Should be far lower. */
+ digestset_free(set);
+}
+
/* stop threads running at once. */
static tor_mutex_t *_thread_test_mutex = NULL;
/* make sure that threads have to run at the same time. */
@@ -3362,6 +3396,69 @@
}
static void
+bench_dmap(void)
+{
+ smartlist_t *sl = smartlist_create();
+ smartlist_t *sl2 = smartlist_create();
+ struct timeval start, end, pt2, pt3, pt4;
+ const int iters = 10000;
+ const int elts = 4000;
+ const int fpostests = 1000000;
+ char d[20];
+ int i,n=0, fp = 0;
+ digestmap_t *dm = digestmap_new();
+ digestset_t *ds = digestset_new(elts);
+
+ for (i = 0; i < elts; ++i) {
+ crypto_rand(d, 20);
+ smartlist_add(sl, tor_memdup(d, 20));
+ }
+ for (i = 0; i < elts; ++i) {
+ crypto_rand(d, 20);
+ smartlist_add(sl2, tor_memdup(d, 20));
+ }
+ printf("nbits=%d\n", ds->mask+1);
+
+ tor_gettimeofday(&start);
+ for (i = 0; i < iters; ++i) {
+ SMARTLIST_FOREACH(sl, const char *, cp, digestmap_set(dm, cp, (void*)1));
+ }
+ tor_gettimeofday(&pt2);
+ for (i = 0; i < iters; ++i) {
+ SMARTLIST_FOREACH(sl, const char *, cp, digestmap_get(dm, cp));
+ SMARTLIST_FOREACH(sl2, const char *, cp, digestmap_get(dm, cp));
+ }
+ tor_gettimeofday(&pt3);
+ for (i = 0; i < iters; ++i) {
+ SMARTLIST_FOREACH(sl, const char *, cp, digestset_add(ds, cp));
+ }
+ tor_gettimeofday(&pt4);
+ for (i = 0; i < iters; ++i) {
+ SMARTLIST_FOREACH(sl, const char *, cp, n += digestset_isin(ds, cp));
+ SMARTLIST_FOREACH(sl2, const char *, cp, n += digestset_isin(ds, cp));
+ }
+ tor_gettimeofday(&end);
+
+ for (i = 0; i < fpostests; ++i) {
+ crypto_rand(d, 20);
+ if (digestset_isin(ds, d)) ++fp;
+ }
+
+ printf("%ld\n",(unsigned long)tv_udiff(&start, &pt2));
+ printf("%ld\n",(unsigned long)tv_udiff(&pt2, &pt3));
+ printf("%ld\n",(unsigned long)tv_udiff(&pt3, &pt4));
+ printf("%ld\n",(unsigned long)tv_udiff(&pt4, &end));
+ printf("-- %d\n", n);
+ printf("++ %f\n", fp/(double)fpostests);
+ digestmap_free(dm, NULL);
+ digestset_free(ds);
+ SMARTLIST_FOREACH(sl, char *, cp, tor_free(cp));
+ SMARTLIST_FOREACH(sl2, char *, cp, tor_free(cp));
+ smartlist_free(sl);
+ smartlist_free(sl2);
+}
+
+static void
test_util_mempool(void)
{
mp_pool_t *pool;
@@ -3850,6 +3947,7 @@
SUBENT(util, datadir),
SUBENT(util, smartlist),
SUBENT(util, bitarray),
+ SUBENT(util, digestset),
SUBENT(util, mempool),
SUBENT(util, memarea),
SUBENT(util, strmap),
@@ -3960,6 +4058,11 @@
return 0;
}
+ if (0) {
+ bench_dmap();
+ return 0;
+ }
+
atexit(remove_directory);
printf("Running Tor unit tests on %s\n", get_uname());