[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]

[or-cvs] [tor/master 2/2] More gracefully handle corrupt state files.



Author: Mike Perry <mikeperry-git@xxxxxxxxxx>
Date: Tue, 6 Jul 2010 12:08:13 -0700
Subject: More gracefully handle corrupt state files.
Commit: a9edb0b4f67825a11647c8faefa613d704be44ae

Save a backup if we get odd circuitbuildtimes and other state info.

In the case of circuit build times, we no longer assert, and reset our state.
---
 changes/cbt-bugfixes  |    2 +
 src/or/circuitbuild.c |   27 ++++++++++++++++---
 src/or/config.c       |   71 ++++++++++++++++++++++++++++++------------------
 3 files changed, 69 insertions(+), 31 deletions(-)

diff --git a/changes/cbt-bugfixes b/changes/cbt-bugfixes
index 31241c9..12e62a8 100644
--- a/changes/cbt-bugfixes
+++ b/changes/cbt-bugfixes
@@ -29,5 +29,7 @@
      parameter and via a LearnCircuitBuildTimeout config option. Also
      automatically disable circuit build time calculation if we are either
      a AuthoritativeDirectory, or if we fail to write our state file. Bug 1296.
+   - More gracefully handle corrupt state files, removing asserts in favor
+     of saving a backup and resetting state.
 
 
diff --git a/src/or/circuitbuild.c b/src/or/circuitbuild.c
index 07a66cf..4090054 100644
--- a/src/or/circuitbuild.c
+++ b/src/or/circuitbuild.c
@@ -193,12 +193,11 @@ circuit_build_times_new_consensus_params(circuit_build_times_t *cbt,
   int32_t num = networkstatus_get_param(ns, "cbtrecentcount",
                    CBT_DEFAULT_RECENT_CIRCUITS);
 
-  if (num != cbt->liveness.num_recent_circs) {
+  if (num > 0 && num != cbt->liveness.num_recent_circs) {
     int8_t *recent_circs;
     log_notice(LD_CIRC, "Changing recent timeout size from %d to %d",
                cbt->liveness.num_recent_circs, num);
 
-    tor_assert(num > 0);
     tor_assert(cbt->liveness.timeouts_after_firsthop);
 
     /*
@@ -675,6 +674,10 @@ circuit_build_times_parse_state(circuit_build_times_t *cbt,
             "Corrupt state file? Build times count mismatch. "
             "Read %d times, but file says %d", loaded_cnt,
             state->TotalBuildTimes);
+    *msg = tor_strdup("Build times count mismatch.");
+    circuit_build_times_reset(cbt);
+    tor_free(loaded_times);
+    return -1;
   }
 
   circuit_build_times_shuffle_and_store_array(cbt, loaded_times, loaded_cnt);
@@ -688,8 +691,18 @@ circuit_build_times_parse_state(circuit_build_times_t *cbt,
   log_info(LD_CIRC,
            "Loaded %d/%d values from %d lines in circuit time histogram",
            tot_values, cbt->total_build_times, N);
-  tor_assert(cbt->total_build_times == tot_values);
-  tor_assert(cbt->total_build_times <= CBT_NCIRCUITS_TO_OBSERVE);
+
+  if (cbt->total_build_times != tot_values
+        || cbt->total_build_times > CBT_NCIRCUITS_TO_OBSERVE) {
+    log_warn(LD_CIRC,
+            "Corrupt state file? Shuffled build times mismatch. "
+            "Read %d times, but file says %d", tot_values,
+            state->TotalBuildTimes);
+    *msg = tor_strdup("Build times count mismatch.");
+    circuit_build_times_reset(cbt);
+    tor_free(loaded_times);
+    return -1;
+  }
 
   circuit_build_times_set_timeout(cbt);
 
@@ -742,6 +755,12 @@ circuit_build_times_update_alpha(circuit_build_times_t *cbt)
     n++;
   }
 
+  /*
+   * We are erring and asserting here because this can only happen
+   * in codepaths other than startup. The startup state parsing code
+   * performs this same check, and resets state if it hits it. If we
+   * hit it at runtime, something serious has gone wrong.
+   */
   if (n!=cbt->total_build_times) {
     log_err(LD_CIRC, "Discrepancy in build times count: %d vs %d", n,
             cbt->total_build_times);
diff --git a/src/or/config.c b/src/or/config.c
index 771fa03..7dee8ff 100644
--- a/src/or/config.c
+++ b/src/or/config.c
@@ -4837,25 +4837,63 @@ or_state_validate(or_state_t *old_state, or_state_t *state,
 }
 
 /** Replace the current persistent state with <b>new_state</b> */
-static void
+static int
 or_state_set(or_state_t *new_state)
 {
   char *err = NULL;
+  int ret = 0;
   tor_assert(new_state);
   config_free(&state_format, global_state);
   global_state = new_state;
   if (entry_guards_parse_state(global_state, 1, &err)<0) {
     log_warn(LD_GENERAL,"%s",err);
     tor_free(err);
+    ret = -1;
   }
   if (rep_hist_load_state(global_state, &err)<0) {
     log_warn(LD_GENERAL,"Unparseable bandwidth history state: %s",err);
     tor_free(err);
+    ret = -1;
   }
   if (circuit_build_times_parse_state(&circ_times, global_state, &err) < 0) {
     log_warn(LD_GENERAL,"%s",err);
     tor_free(err);
+    ret = -1;
   }
+  return ret;
+}
+
+/**
+ * Save a broken state file to a backup location.
+ */
+static void
+or_state_save_broken(char *fname)
+{
+  int i;
+  file_status_t status;
+  size_t len = strlen(fname)+16;
+  char *fname2 = tor_malloc(len);
+  for (i = 0; i < 100; ++i) {
+    tor_snprintf(fname2, len, "%s.%d", fname, i);
+    status = file_status(fname2);
+    if (status == FN_NOENT)
+      break;
+  }
+  if (i == 100) {
+    log_warn(LD_BUG, "Unable to parse state in \"%s\"; too many saved bad "
+             "state files to move aside. Discarding the old state file.",
+             fname);
+    unlink(fname);
+  } else {
+    log_warn(LD_BUG, "Unable to parse state in \"%s\". Moving it aside "
+             "to \"%s\".  This could be a bug in Tor; please tell "
+             "the developers.", fname, fname2);
+    if (rename(fname, fname2) < 0) {
+      log_warn(LD_BUG, "Weirdly, I couldn't even move the state aside. The "
+               "OS gave an error of %s", strerror(errno));
+    }
+  }
+  tor_free(fname2);
 }
 
 /** Reload the persistent state from disk, generating a new state as needed.
@@ -4917,31 +4955,8 @@ or_state_load(void)
              " This is a bug in Tor.");
     goto done;
   } else if (badstate && contents) {
-    int i;
-    file_status_t status;
-    size_t len = strlen(fname)+16;
-    char *fname2 = tor_malloc(len);
-    for (i = 0; i < 100; ++i) {
-      tor_snprintf(fname2, len, "%s.%d", fname, i);
-      status = file_status(fname2);
-      if (status == FN_NOENT)
-        break;
-    }
-    if (i == 100) {
-      log_warn(LD_BUG, "Unable to parse state in \"%s\"; too many saved bad "
-               "state files to move aside. Discarding the old state file.",
-               fname);
-      unlink(fname);
-    } else {
-      log_warn(LD_BUG, "Unable to parse state in \"%s\". Moving it aside "
-               "to \"%s\".  This could be a bug in Tor; please tell "
-               "the developers.", fname, fname2);
-      if (rename(fname, fname2) < 0) {
-        log_warn(LD_BUG, "Weirdly, I couldn't even move the state aside. The "
-                 "OS gave an error of %s", strerror(errno));
-      }
-    }
-    tor_free(fname2);
+    or_state_save_broken(fname);
+
     tor_free(contents);
     config_free(&state_format, new_state);
 
@@ -4953,7 +4968,9 @@ or_state_load(void)
   } else {
     log_info(LD_GENERAL, "Initialized state");
   }
-  or_state_set(new_state);
+  if (or_state_set(new_state) == -1) {
+    or_state_save_broken(fname);
+  }
   new_state = NULL;
   if (!contents) {
     global_state->next_write = 0;
-- 
1.7.1