[or-cvs] r18551: {torflow} Fix nasty bug with node updates that was silently destroying (torflow/trunk/NetworkScanners)
Author: mikeperry
Date: 2009-02-15 19:32:42 -0500 (Sun, 15 Feb 2009)
New Revision: 18551
Modified:
torflow/trunk/NetworkScanners/libsoat.py
torflow/trunk/NetworkScanners/soat.py
Log:
Fix a nasty bug with node updates that was silently destroying
scan progress. Related: make the test runs a bit more forgiving
of inconclusive results when deciding to mark a node.
Also, store soupdiffs and jsdiffs to disk with the results
so we can accumulate differences, and always refetch SSL
certs.
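
The node-update bug in a nutshell: update_nodes() used to rebuild
self.nodes from scratch on every consensus fetch, silently resurrecting
nodes that mark_chosen() had already finished testing and removed. A
minimal sketch of the fixed set arithmetic (update_tracked_nodes is an
illustrative free function, not the actual method; the arguments mirror
the sets used in the diff below):

    def update_tracked_nodes(tracked, marked, old_ids, current_ids):
        # All arguments are sets of node idhex strings.
        new_ids = current_ids - old_ids - marked  # never seen, never marked
        tracked &= current_ids                    # drop nodes that left the network
        return tracked | new_ids                  # add only genuinely new nodes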
Modified: torflow/trunk/NetworkScanners/libsoat.py
===================================================================
--- torflow/trunk/NetworkScanners/libsoat.py 2009-02-15 19:13:01 UTC (rev 18550)
+++ torflow/trunk/NetworkScanners/libsoat.py 2009-02-16 00:32:42 UTC (rev 18551)
@@ -78,7 +78,14 @@
self.verbose=0
self.from_rescan = False
self.filename=None
+ self._pickle_revision = 1
+ def depickle_upgrade(self):
+ if not "_pickle_revision" in self.__dict__: # upgrade to v0
+ self._pickle_revision = 0
+ if self._pickle_revision < 1:
+ self._pickle_revision = 1
+
def _rebase(self, filename, new_data_root):
if not filename: return filename
filename = os.path.normpath(filename)
@@ -150,6 +157,7 @@
ret = TestResult.__str__(self)
ssl_file = open(self.ssl_file, 'r')
ssl_domain = pickle.load(ssl_file)
+ ssl_domain.depickle_upgrade()
ssl_file.close()
ret += " Rotates: "+str(ssl_domain.cert_rotates)
ret += " Changed: "+str(ssl_domain.cert_changed)+"\n"
@@ -175,11 +183,14 @@
self.cert_rotates = False
self.cert_changed = False
+ def depickle_upgrade(self):
+ pass
+
def add_cert(self, ip, cert_string):
if ip in self.ip_map and self.ip_map[ip] != cert_string:
plog("NOTICE", self.domain+" has changed certs.")
self.cert_changed = True
- elif len(self.cert_map) and cert_string not in self.cert_map:
+ if len(self.cert_map) and cert_string not in self.cert_map:
plog("NOTICE", self.domain+" is rotating certs.")
self.cert_rotates = True
self.cert_map[cert_string] = ip
@@ -255,23 +266,32 @@
class JsTestResult(TestResult):
''' Represents the result of a JS test '''
def __init__(self, exit_node, website, status, reason=None,
- content=None, content_exit=None, content_old=None):
+ content=None, content_exit=None, content_old=None,
+ jsdiffer=None):
super(JsTestResult, self).__init__(exit_node, website, status, reason)
self.proto = "http"
self.content = content
self.content_exit = content_exit
self.content_old = content_old
+ self.jsdiffer = jsdiffer
+ def depickle_upgrade(self):
+ if not "_pickle_revision" in self.__dict__ or self._pickle_revision < 1:
+ self.jsdiffer = None
+ TestResult.depickle_upgrade(self)
+
def rebase(self, new_data_root):
self.content = self._rebase(self.content, new_data_root)
self.content_exit = self._rebase(self.content_exit, new_data_root)
self.content_old = self._rebase(self.content_old, new_data_root)
+ self.jsdiffer = self._rebase(self.jsdiffer, new_data_root)
def mark_false_positive(self, reason):
TestResult.mark_false_positive(self, reason)
self.content=self.move_file(self.content, http_falsepositive_dir)
self.content_old=self.move_file(self.content_old, http_falsepositive_dir)
self.content_exit=self.move_file(self.content_exit,http_falsepositive_dir)
+ self.jsdiffer=self.move_file(self.jsdiffer,http_falsepositive_dir)
def remove_files(self):
try: os.unlink(self.content)
@@ -310,23 +330,36 @@
class HtmlTestResult(TestResult):
''' Represents the result of a http test '''
def __init__(self, exit_node, website, status, reason=None,
- content=None, content_exit=None, content_old=None):
+ content=None, content_exit=None, content_old=None,
+ soupdiffer=None, jsdiffer=None):
super(HtmlTestResult, self).__init__(exit_node, website, status, reason)
self.proto = "http"
self.content = content
self.content_exit = content_exit
self.content_old = content_old
+ self.soupdiffer = soupdiffer
+ self.jsdiffer = jsdiffer
+ def depickle_upgrade(self):
+ if not "_pickle_revision" in self.__dict__ or self._pickle_revision < 1:
+ self.soupdiffer = None
+ self.jsdiffer = None
+ TestResult.depickle_upgrade(self)
+
def rebase(self, new_data_root):
self.content = self._rebase(self.content, new_data_root)
self.content_exit = self._rebase(self.content_exit, new_data_root)
self.content_old = self._rebase(self.content_old, new_data_root)
+ self.soupdiffer = self._rebase(self.soupdiffer, new_data_root)
+ self.jsdiffer = self._rebase(self.jsdiffer, new_data_root)
def mark_false_positive(self, reason):
TestResult.mark_false_positive(self, reason)
self.content=self.move_file(self.content,http_falsepositive_dir)
self.content_old=self.move_file(self.content_old, http_falsepositive_dir)
self.content_exit=self.move_file(self.content_exit,http_falsepositive_dir)
+ self.soupdiffer=self.move_file(self.soupdiffer,http_falsepositive_dir)
+ self.jsdiffer=self.move_file(self.jsdiffer,http_falsepositive_dir)
def remove_files(self):
try: os.unlink(self.content)
@@ -370,7 +403,11 @@
ret+=line+"\n"
if soup and tor_soup and old_soup:
- soupdiff = SoupDiffer(old_soup, soup)
+ if self.soupdiffer and os.path.exists(self.soupdiffer):
+ soupdiff = pickle.load(open(self.soupdiffer, 'r'))
+ soupdiff.depickle_upgrade()
+ else:
+ soupdiff = SoupDiffer(old_soup, soup)
more_tags = soupdiff.show_changed_tags(tor_soup)
more_attrs = soupdiff.show_changed_attrs(tor_soup)
@@ -518,13 +555,16 @@
if f.endswith('.result'):
fh = open(os.path.join(root, f))
result = pickle.load(fh)
+ result.depickle_upgrade()
result.rebase(self.data_dir)
results.append(result)
return results
def getResult(self, file):
fh = open(file, 'r')
- return pickle.load(fh)
+ res = pickle.load(fh)
+ res.depickle_upgrade()
+ return res
def uniqueFilename(afile):
(prefix,suffix)=os.path.splitext(afile)
@@ -592,8 +632,8 @@
test_file = open(filename+"."+str(position)+".test", 'r')
test = pickle.load(test_file)
+ test.depickle_upgrade()
test_file.close()
- test.depickle_upgrade()
return test
def saveTest(self, test):
@@ -670,10 +710,10 @@
cntnt_old = self._get_content(soup_old)
self.content_pool = cntnt_new | cntnt_old
self.content_changed = bool(cntnt_new ^ cntnt_old)
-
- def rebase(self, new_dir):
+ self._pickle_revision = 0
+
+ def depickle_upgrade(self):
pass
- # XXX
def _get_tags(self, soup):
return sets.Set(map(str,
@@ -735,11 +775,11 @@
ret = ""
for tags in map(TheChosenSoup, new_tags):
for t in tags.findAll():
- if t.name not in self.changed_attr_map:
+ if t.name not in self.changed_tag_map:
ret += " New Tag: "+str(t)+"\n"
else:
for attr in t.attrs:
- if attr[0] not in self.changed_attr_map[t.name] \
+ if attr[0] not in self.changed_tag_map[t.name] \
and attr[0] in attrs_to_check_map:
ret += " New Attr "+attr[0]+": "+str(t)+"\n"
return ret
@@ -767,8 +807,12 @@
class JSDiffer:
def __init__(self, js_string):
+ self._pickle_revision = 0
if HAVE_PYPY: self.ast_cnts = self._count_ast_elements(js_string)
+ def depickle_upgrade(self):
+ pass
+
def _ast_recursive_worker(ast, ast_cnts):
if not ast.symbol in ast_cnts:
ast_cnts[ast.symbol] = 1
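
The depickle_upgrade() methods above establish a versioned-unpickle
convention: every pickled class carries a _pickle_revision, and callers
run depickle_upgrade() immediately after pickle.load() to backfill fields
added since the object was written. A minimal sketch of the pattern,
assuming a bare class; "some.result" is a placeholder path:

    import pickle

    class TestResult(object):
        def __init__(self):
            self._pickle_revision = 1  # bump whenever new fields are added

        def depickle_upgrade(self):
            # Objects pickled before versioning lack the field entirely.
            if "_pickle_revision" not in self.__dict__:
                self._pickle_revision = 0
            if self._pickle_revision < 1:
                # backfill any fields introduced in revision 1 here
                self._pickle_revision = 1

    fh = open("some.result", 'r')  # placeholder path
    result = pickle.load(fh)
    fh.close()
    result.depickle_upgrade()      # always upgrade right after loading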
Modified: torflow/trunk/NetworkScanners/soat.py
===================================================================
--- torflow/trunk/NetworkScanners/soat.py 2009-02-15 19:13:01 UTC (rev 18550)
+++ torflow/trunk/NetworkScanners/soat.py 2009-02-16 00:32:42 UTC (rev 18551)
@@ -174,7 +174,7 @@
self.min_targets = min_targets
self.filename = None
self.rescan_nodes = sets.Set([])
- self.nodes = []
+ self.nodes = sets.Set([])
self.node_map = {}
self.banned_targets = sets.Set([])
self.total_nodes = 0
@@ -182,7 +182,7 @@
self.nodes_to_mark = 0
self.tests_per_node = num_tests_per_node
self._reset()
- self._pickle_revision = 1 # Will increment as fields are added
+ self._pickle_revision = 2 # Will increment as fields are added
def run_test(self):
raise NotImplemented()
@@ -201,6 +201,8 @@
if type(self.successes[addr]) == int:
self.successes[addr] = sets.Set(xrange(0,self.successes[addr]))
plog("INFO", "Upgraded "+self.__class__.__name__+" to v1")
+ if self._pickle_revision < 2:
+ self._pickle_revision = 2
def refill_targets(self):
if len(self.targets) < self.min_targets:
@@ -218,7 +220,7 @@
if target in self.successes: del self.successes[target]
if target in self.exit_fails: del self.exit_fails[target]
kill_results = []
- for r in self.results:
+ for r in self.results:
if r.site == target:
kill_results.append(r)
for r in kill_results:
@@ -261,12 +263,18 @@
return random.choice(list(self.nodes))
def update_nodes(self):
+ all_old_nodes = sets.Set(self.node_map.keys())
nodes = metacon.node_manager.get_nodes_for_port(self.port)
self.node_map = {}
for n in nodes:
self.node_map[n.idhex] = n
self.total_nodes = len(nodes)
- self.nodes = sets.Set(map(lambda n: n.idhex, nodes))
+ all_new_nodes = sets.Set(map(lambda n: n.idhex, nodes))
+ marked_nodes = sets.Set(self.node_results.keys())
+ new_nodes = all_new_nodes - all_old_nodes
+ new_nodes -= marked_nodes
+ self.nodes &= all_new_nodes # Clear down nodes
+ self.nodes = self.nodes | new_nodes # add new ones
# Only scan the stuff loaded from the rescan
if self.rescan_nodes: self.nodes &= self.rescan_nodes
if not self.nodes:
@@ -278,11 +286,16 @@
exit_node = metacon.get_exit_node()[1:]
if exit_node != node:
plog("ERROR", "Asked to mark a node that is not current: "+node+" vs "+exit_node)
+ plog("INFO", "Marking "+node+" with result "+str(result))
self.nodes_marked += 1
if not node in self.node_results: self.node_results[node] = []
self.node_results[node].append(result)
if len(self.node_results[node]) >= self.tests_per_node:
self.nodes.remove(node)
+ plog("INFO", "Removed node "+node+". "+str(len(self.nodes))+" nodes remain")
+ else:
+ plog("DEBUG", "Keeping node "+node+". "+str(len(self.nodes))+" nodes remain. Tests: "+str(len(self.node_results[node]))+"/"+str(self.tests_per_node))
+
def finished(self):
return not self.nodes
@@ -531,8 +544,7 @@
self.tor_cookie_jar = cookielib.MozillaCookieJar()
self.cookie_jar = cookielib.MozillaCookieJar()
self.headers = copy.copy(firefox_headers)
-
- ret_result = TEST_SUCCESS
+
self.tests_run += 1
n_tests = random.choice(xrange(1,len(self.targets)+1))
@@ -540,20 +552,24 @@
plog("INFO", "HTTPTest decided to fetch "+str(n_tests)+" urls of types: "+str(filetypes))
+ n_success = n_fail = n_inconclusive = 0
for ftype in filetypes:
# FIXME: Set referrer to random or none for each of these
address = random.choice(self.targets[ftype])
result = self.check_http(address)
- if result > ret_result:
- ret_result = result
- result = self.check_cookies()
- if result > ret_result:
- ret_result = result
+ if result == TEST_INCONCLUSIVE: n_inconclusive += 1
+ if result == TEST_FAILURE: n_fail += 1
+ if result == TEST_SUCCESS: n_success += 1
# Cookie jars contain locks and can't be pickled. Clear them away.
self.tor_cookie_jar = None
self.cookie_jar = None
- return ret_result
+
+ if n_fail: return TEST_FAILURE
+ elif n_inconclusive > 2*n_success: # > 66% inconclusive -> redo
+ return TEST_INCONCLUSIVE
+ else:
+ return TEST_SUCCESS
def _remove_target_addr(self, target):
for ftype in self.targets:
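
Aside: the run_test() hunk above replaces "propagate the worst per-URL
result" with a tally: any hard failure still fails the run, while scattered
inconclusives only force a redo once they dominate. The decision, as a
sketch using the module's TEST_* constants (the HTML test further down uses
a stricter one-third threshold):

    def tally(n_success, n_fail, n_inconclusive):
        if n_fail:
            return TEST_FAILURE
        # n_inconclusive > 2*n_success <=> inconclusive share > 2/3 (~66%)
        if n_inconclusive > 2 * n_success:
            return TEST_INCONCLUSIVE
        return TEST_SUCCESS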
@@ -896,6 +912,18 @@
self.proto = "HTML"
self.recurse_filetypes = recurse_filetypes
self.fetch_queue = []
+
+ def _reset(self):
+ HTTPTest._reset(self)
+ self.targets = [] # FIXME: Lame..
+ self.soupdiffer_files = {}
+ self.jsdiffer_files = {}
+
+ def depickle_upgrade(self):
+ if self._pickle_revision < 2:
+ self.soupdiffer_files = {}
+ self.jsdiffer_files = {}
+ SearchBasedTest.depickle_upgrade(self)
def run_test(self):
# A single test should have a single cookie jar
@@ -903,22 +931,24 @@
self.cookie_jar = cookielib.MozillaCookieJar()
self.headers = copy.copy(firefox_headers)
+ use_referers = False
first_referer = None
if random.randint(1,100) < referer_chance_pct:
+ use_referers = True
# FIXME: Hrmm.. May want to do this a bit better..
first_referer = random.choice(self.targets)
plog("INFO", "Chose random referer "+first_referer)
- ret_result = TEST_SUCCESS
self.tests_run += 1
# TODO: Watch for spider-traps! (ie mutually sourcing iframes)
# Keep a trail log for this test and check for loops
address = random.choice(self.targets)
self.fetch_queue.append(("html", address, first_referer))
+ n_success = n_fail = n_inconclusive = 0
while self.fetch_queue:
(test, url, referer) = self.fetch_queue.pop(0)
- if referer: self.headers['Referer'] = referer
+ if use_referers and referer: self.headers['Referer'] = referer
# Technically both html and js tests check and dispatch via mime types
# but I want to know when link tags lie
if test == "html" or test == "http": result = self.check_html(url)
@@ -926,21 +956,26 @@
else:
plog("WARN", "Unknown test type: "+test+" for "+url)
result = TEST_SUCCESS
- if result > ret_result:
- ret_result = result
- result = self.check_cookies()
- if result > ret_result:
- ret_result = result
+ if result == TEST_INCONCLUSIVE: n_inconclusive += 1
+ if result == TEST_FAILURE: n_fail += 1
+ if result == TEST_SUCCESS: n_success += 1
# Need to clear because the cookiejars use locks...
self.tor_cookie_jar = None
self.cookie_jar = None
- return ret_result
+ if n_fail: return TEST_FAILURE
+ elif 2*n_inconclusive > n_success: # > 33% inconclusive -> redo
+ return TEST_INCONCLUSIVE
+ else:
+ return TEST_SUCCESS
+
# FIXME: This is pretty lame.. We should change how
# the HTTPTest stores URLs so we don't have to do this.
def _remove_target_addr(self, target):
Test._remove_target_addr(self, target)
+ if target in self.soupdiffer_files: del self.soupdiffer_files[target]
+ if target in self.jsdiffer_files: del self.jsdiffer_files[target]
def refill_targets(self):
Test.refill_targets(self)
@@ -965,9 +1000,9 @@
elif t.name in recurse_script:
if t.name == "link":
for a in t.attrs:
- if a[0] == "type" and a[1] in script_mime_types:
- plog("INFO", "Adding link script for: "+str(t))
- targets.append(("js", urlparse.urljoin(orig_addr, attr_tgt)))
+ #if a[0] == "type" and a[1] in script_mime_types:
+ plog("INFO", "Adding link script for: "+str(t))
+ targets.append(("js", urlparse.urljoin(orig_addr, attr_tgt)))
else:
plog("INFO", "Adding script tag for: "+str(t))
targets.append(("js", urlparse.urljoin(orig_addr, attr_tgt)))
@@ -997,19 +1032,50 @@
if type(ret) == int:
return ret
return self._check_js_worker(address, ret)
+
+ def is_html(self, mime_type, content):
+ is_html = False
+ for type_match in html_mime_types:
+ if re.match(type_match, mime_type):
+ is_html = True
+ break
+ return is_html
+
+ def is_script(self, mime_type, content):
+ is_script = False
+ for type_match in script_mime_types:
+ if re.match(type_match, mime_type):
+ is_script = True
+ break
+ return is_script
def _check_js_worker(self, address, http_ret):
(mime_type, tor_js, tsha, orig_js, osha, new_js, nsha, exit_node) = http_ret
- if mime_type not in script_mime_types:
+ if not self.is_script(mime_type, orig_js):
plog("WARN", "Non-script mime type "+mime_type+" fed to JS test for "+address)
- if mime_type in html_mime_types:
+
+ if self.is_html(mime_type, orig_js):
return self._check_html_worker(address, http_ret)
else:
return self._check_http_worker(address, http_ret)
-
- jsdiff = JSDiffer(orig_js)
+
+ address_file = DataHandler.safeFilename(address[7:])
+ content_prefix = http_content_dir+address_file
+ failed_prefix = http_failed_dir+address_file
+
+ if address in self.jsdiffer_files:
+ plog("DEBUG", "Loading jsdiff for "+address)
+ jsdiff = pickle.load(open(self.jsdiffer_files[address], 'r'))
+ jsdiff.depickle_upgrade()
+ else:
+ plog("DEBUG", "No jsdiff for "+address+". Creating+dumping")
+ jsdiff = JSDiffer(orig_js)
+ self.jsdiffer_files[address] = content_prefix+".jsdiff"
+
jsdiff.prune_differences(new_js)
+ pickle.dump(jsdiff, open(self.jsdiffer_files[address], 'w'))
+
has_js_changes = jsdiff.contains_differences(tor_js)
if not has_js_changes:
@@ -1020,10 +1086,6 @@
self.register_success(address, exit_node)
return TEST_SUCCESS
else:
- address_file = DataHandler.safeFilename(address[7:])
- content_prefix = http_content_dir+address_file
- failed_prefix = http_failed_dir+address_file
-
exit_content_file = open(DataHandler.uniqueFilename(failed_prefix+'.'+exit_node[1:]+'.dyn-content'), 'w')
exit_content_file.write(tor_js)
exit_content_file.close()
@@ -1031,7 +1093,8 @@
result = JsTestResult(exit_node, address, TEST_FAILURE,
FAILURE_DYNAMIC, content_prefix+".content",
exit_content_file.name,
- content_prefix+'.content-old')
+ content_prefix+'.content-old',
+ self.jsdiffer_files[address])
if self.rescan_nodes: result.from_rescan = True
self.results.append(result)
datahandler.saveResult(result)
@@ -1051,10 +1114,10 @@
def _check_html_worker(self, address, http_ret):
(mime_type,tor_html,tsha,orig_html,osha,new_html,nsha,exit_node)=http_ret
- if mime_type not in html_mime_types:
+ if not self.is_html(mime_type, orig_html):
# XXX: Keep an eye on this logline.
- plog("INFO", "Non-html mime type "+mime_type+" fed to HTML test for "+address)
- if mime_type in script_mime_types:
+ plog("WARN", "Non-html mime type "+mime_type+" fed to HTML test for "+address)
+ if self.is_script(mime_type, orig_html):
return self._check_js_worker(address, http_ret)
else:
return self._check_http_worker(address, http_ret)
@@ -1120,7 +1183,17 @@
# 3. Compare list of changed tags for tor vs new and
# see if any extra tags changed or if new attributes
# were added to additional tags
- soupdiff = SoupDiffer(orig_soup, new_soup)
+ if address in self.soupdiffer_files:
+ plog("DEBUG", "Loading soupdiff for "+address)
+ soupdiff = pickle.load(open(self.soupdiffer_files[address], 'r'))
+ soupdiff.depickle_upgrade()
+ soupdiff.prune_differences(new_soup)
+ else:
+ plog("DEBUG", "No soupdiff for "+address+". Creating+dumping")
+ soupdiff = SoupDiffer(orig_soup, new_soup)
+ self.soupdiffer_files[address] = content_prefix+".soupdiff"
+
+ pickle.dump(soupdiff, open(self.soupdiffer_files[address], 'w'))
more_tags = soupdiff.show_changed_tags(tor_soup)
more_attrs = soupdiff.show_changed_attrs(tor_soup)
@@ -1139,8 +1212,18 @@
false_positive = True
if false_positive:
- jsdiff = JSSoupDiffer(orig_soup)
+ if address in self.jsdiffer_files:
+ plog("DEBUG", "Loading jsdiff for "+address)
+ jsdiff = pickle.load(open(self.jsdiffer_files[address], 'r'))
+ jsdiff.depickle_upgrade()
+ else:
+ plog("DEBUG", "No jsdiff for "+address+". Creating+dumping")
+ jsdiff = JSSoupDiffer(orig_soup)
+ self.jsdiffer_files[address] = content_prefix+".jsdiff"
+
jsdiff.prune_differences(new_soup)
+ pickle.dump(jsdiff, open(self.jsdiffer_files[address], 'w'))
+
differences = jsdiff.show_differences(tor_soup)
false_positive = not differences
plog("INFO", "JSSoupDiffer predicts false_positive="+str(false_positive))
@@ -1159,11 +1242,19 @@
exit_content_file = open(DataHandler.uniqueFilename(failed_prefix+'.'+exit_node[1:]+'.dyn-content'),'w')
exit_content_file.write(tor_html)
exit_content_file.close()
+
+ if address in self.jsdiffer_files:
+ jsdiff_file = self.jsdiffer_files[address]
+ else: jsdiff_file = None
+ if address in self.soupdiffer_files:
+ soupdiff_file = self.soupdiffer_files[address]
+ else: soupdiff_file = None
result = HtmlTestResult(exit_node, address, TEST_FAILURE,
FAILURE_DYNAMIC, content_prefix+".content",
exit_content_file.name,
- content_prefix+'.content-old')
+ content_prefix+'.content-old',
+ soupdiff_file, jsdiff_file)
if self.rescan_nodes: result.from_rescan = True
self.results.append(result)
datahandler.saveResult(result)
@@ -1242,15 +1333,16 @@
def _update_cert_list(self, ssl_domain, check_ips):
changed = False
for ip in check_ips:
- if not ssl_domain.seen_ip(ip):
- plog('INFO', 'Ssl connection to new ip '+ip+" for "+ssl_domain.domain)
- raw_cert = self.ssl_request(ip)
- if not raw_cert or isinstance(raw_cert, Exception):
- plog('WARN', 'Error getting the correct cert for '+ssl_domain.domain+":"+ip)
- continue
- ssl_domain.add_cert(ip,
- crypto.dump_certificate(crypto.FILETYPE_PEM, raw_cert))
- changed = True
+ #let's always check.
+ #if not ssl_domain.seen_ip(ip):
+ plog('INFO', 'Ssl connection to new ip '+ip+" for "+ssl_domain.domain)
+ raw_cert = self.ssl_request(ip)
+ if not raw_cert or isinstance(raw_cert, Exception):
+ plog('WARN', 'Error getting the correct cert for '+ssl_domain.domain+":"+ip)
+ continue
+ ssl_domain.add_cert(ip,
+ crypto.dump_certificate(crypto.FILETYPE_PEM, raw_cert))
+ changed = True # Always save new copy.
return changed
def check_openssl(self, address):
@@ -1266,6 +1358,7 @@
try:
ssl_file = open(ssl_file_name, 'r')
ssl_domain = pickle.load(ssl_file)
+ ssl_domain.depickle_upgrade()
ssl_file.close()
except IOError:
ssl_domain = SSLDomain(address)
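
Finally, the reason SoupDiffer/JSDiffer state is now pickled next to the
results: each run prunes legitimately-dynamic content out of the differ,
so persisting it lets those pruned differences accumulate across runs and
shrink false positives over time. The load-prune-dump cycle in sketch form
(load_or_create_soupdiff is an illustrative helper, not committed code;
SoupDiffer is the class from libsoat.py):

    import os
    import pickle

    def load_or_create_soupdiff(path, old_soup, new_soup):
        if os.path.exists(path):
            # Reuse the differ state accumulated by earlier runs.
            soupdiff = pickle.load(open(path, 'r'))
            soupdiff.depickle_upgrade()
            soupdiff.prune_differences(new_soup)  # fold in this run's dynamics
        else:
            soupdiff = SoupDiffer(old_soup, new_soup)
        pickle.dump(soupdiff, open(path, 'w'))    # persist for the next run
        return soupdiff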