[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]
[or-cvs] r18484: {torflow} Fix a bug with the HTML false positive filter where we weren't properly tracking all changed tags (torflow/trunk/NetworkScanners)
Author: mikeperry
Date: 2009-02-11 06:27:48 -0500 (Wed, 11 Feb 2009)
New Revision: 18484
Modified:
torflow/trunk/NetworkScanners/README.ExitScanning
torflow/trunk/NetworkScanners/libsoat.py
torflow/trunk/NetworkScanners/soat.py
torflow/trunk/NetworkScanners/soatstats.py
Log:
Fix a bug with the HTML false positive filter where we
weren't properly tracking all changed tags. Also fix rewind
behavior to properly clear accumulated URL error information.
Modified: torflow/trunk/NetworkScanners/README.ExitScanning
===================================================================
--- torflow/trunk/NetworkScanners/README.ExitScanning 2009-02-11 08:04:46 UTC (rev 18483)
+++ torflow/trunk/NetworkScanners/README.ExitScanning 2009-02-11 11:27:48 UTC (rev 18484)
@@ -54,7 +54,7 @@
The patch to fix this bug is present in ../tordiffs/XXX.
It is also strongly recommended that you have a custom Tor instance that
-it devoted only to exit scanning, and is not performing any other
+is devoted only to exit scanning, and is not performing any other
function (including serving as a relay or a directory authority).
@@ -91,6 +91,9 @@
urls. This can be useful if you believe it likely for an adversary to
target only certain keywords/concepts/sites in a particular context.
+You can edit the contents of the wordlist files while SoaT runs. It will
+pick up the changes after it completes a full network scan with the old
+list.
IV. Running Tor, The Metatroller, and SoaT
Modified: torflow/trunk/NetworkScanners/libsoat.py
===================================================================
--- torflow/trunk/NetworkScanners/libsoat.py 2009-02-11 08:04:46 UTC (rev 18483)
+++ torflow/trunk/NetworkScanners/libsoat.py 2009-02-11 11:27:48 UTC (rev 18484)
@@ -10,6 +10,7 @@
import traceback
import difflib
import re
+import copy
sys.path.append("./libs")
from BeautifulSoup.BeautifulSoup import Tag, SoupStrainer
@@ -317,6 +318,7 @@
def __str__(self):
ret = TestResult.__str__(self)
if self.verbose:
+ soup = old_soup = tor_soup = None
if self.content and self.content_old:
content = open(self.content).read().decode('ascii', 'ignore')
content_old = open(self.content_old).read().decode('ascii', 'ignore')
@@ -339,6 +341,30 @@
lineterm="")
for line in diff:
ret+=line+"\n"
+
+ if soup and tor_soup and old_soup:
+ old_vs_new = SoupDiffer(old_soup, soup)
+ new_vs_old = SoupDiffer(soup, old_soup)
+ new_vs_tor = SoupDiffer(soup, tor_soup)
+
+ # I'm an evil man and I'm going to CPU hell..
+ changed_tags = SoupDiffer.merge_tag_maps(
+ old_vs_new.changed_tags_with_attrs(),
+ new_vs_old.changed_tags_with_attrs())
+
+ changed_attributes = SoupDiffer.merge_tag_maps(
+ old_vs_new.changed_attributes_by_tag(),
+ new_vs_old.changed_attributes_by_tag())
+
+ changed_content = bool(old_vs_new.changed_content() or old_vs_new.changed_content())
+
+ ret += "\nTor changed tags:\n"
+ ret += new_vs_tor.more_changed_tags(changed_tags)
+ ret += "\nTor changed attrs:\n"
+ ret += new_vs_tor.more_changed_attrs(changed_attributes)
+ if not changed_content:
+ ret += "\nChanged Content:\n"
+ ret += "\n".join(new_vs_tor.changed_content())+"\n"
else:
if self.content:
ret += " "+self.content+"\n"
@@ -595,6 +621,7 @@
changed_tags[t.name].add(attr[0])
return changed_tags
+
def has_more_changed_tags(self, tag_attr_map):
""" Returns true if we have additional tags with additional
attributes that were not present in tag_attr_map
@@ -609,6 +636,18 @@
return True
return False
+ def more_changed_tags(self, tag_attr_map):
+ ret = ""
+ for tags in map(TheChosenSoup, self.changed_tags()):
+ for t in tags.findAll():
+ if t.name not in tag_attr_map:
+ ret += " New Tag: "+str(t)+"\n"
+ else:
+ for attr in t.attrs:
+ if attr[0] not in tag_attr_map[t.name]:
+ ret += " New Attr "+attr[0]+": "+str(t)+"\n"
+ return ret
+
def _get_attributes(self):
attrs_old = [(tag.name, tag.attrs) for tag in self.soup_old.findAll()]
attrs_new = [(tag.name, tag.attrs) for tag in self.soup_new.findAll()]
@@ -640,6 +679,17 @@
changed_attributes[tag].add(attr[0])
return changed_attributes
+ def merge_tag_maps(tag_map1, tag_map2):
+ " Merges either two tag_attr_maps or two attrs_by_tag maps "
+ ret = copy.deepcopy(tag_map1)
+ for tag in tag_map2:
+ if tag not in ret:
+ ret[tag] = copy.deepcopy(tag_map2[tag])
+ else:
+ ret[tag].union_update(tag_map2[tag])
+ return ret
+ merge_tag_maps = Callable(merge_tag_maps)
+
def has_more_changed_attrs(self, attrs_by_tag):
""" Returns true if we have any tags with additional
changed attributes that were not present in attrs_by_tag
@@ -652,6 +702,17 @@
return True
return False
+ def more_changed_attrs(self, attrs_by_tag):
+ ret = ""
+ for (tag, attr) in self.changed_attributes():
+ if tag in attrs_by_tag:
+ if attr[0] not in attrs_by_tag[tag]:
+ ret += " New Attr "+attr[0]+": "+tag+" "+attr[0]+'="'+attr[1]+'"\n'
+ else:
+ ret += " New Tag: "+tag+" "+attr[0]+'="'+attr[1]+'"\n'
+ return ret
+
+
def changed_content(self):
""" Return a list of tag contents changed in soup_new """
tags_old = sets.Set(map(str,
Modified: torflow/trunk/NetworkScanners/soat.py
===================================================================
--- torflow/trunk/NetworkScanners/soat.py 2009-02-11 08:04:46 UTC (rev 18483)
+++ torflow/trunk/NetworkScanners/soat.py 2009-02-11 11:27:48 UTC (rev 18484)
@@ -152,7 +152,6 @@
traceback.print_exc()
return (666, [], "", str(e))
- # TODO: Consider also returning mime type here
return (reply.code, new_cookies, mime_type, content)
class Test:
@@ -163,13 +162,13 @@
self.mt = mt
self.datahandler = DataHandler()
self.min_targets = min_targets
+ self.exit_limit_pct = max_exit_fail_pct
+ self.dynamic_limit = max_dynamic_failure
self.marked_nodes = sets.Set([])
self.exit_fails = {}
self.successes = {}
- self.exit_limit_pct = max_exit_fail_pct
self.results = []
self.dynamic_fails = {}
- self.dynamic_limit = max_dynamic_failure
self.banned_targets = sets.Set([])
def run_test(self):
@@ -239,7 +238,13 @@
self.tests_run = 0
self.nodes_marked = 0
self.marked_nodes = sets.Set([])
+ self.exit_fails = {}
+ self.successes = {}
+ self.dynamic_fails = {}
+ # TODO: report these results as BadExit before clearing
+ self.results = []
+
def register_exit_failure(self, address, exit_node):
if address in self.exit_fails:
self.exit_fails[address].add(exit_node)
@@ -277,10 +282,14 @@
class SearchBasedTest(Test):
- def __init__(self, mt, proto, port, wordlist):
- self.wordlist = wordlist
+ def __init__(self, mt, proto, port, wordlist_file):
+ self.wordlist_file = wordlist_file
Test.__init__(self, mt, proto, port)
+ def rewind(self):
+ self.wordlist = load_wordlist(self.wordlist_file)
+ Test.rewind(self)
+
def _is_useable_url(self, url, valid_schemes=None, filetypes=None):
(scheme, netloc, path, params, query, fragment) = urlparse.urlparse(url)
if netloc.rfind(":") != -1:
@@ -399,10 +408,14 @@
self.httpcode_limit_pct = max_exit_httpcode_pct
self.scan_filetypes = filetypes
+ def rewind(self):
+ SearchBasedTest.rewind(self)
+ self.httpcode_fails = {}
+
def check_cookies(self):
tor_cookies = "\n"
plain_cookies = "\n"
- # XXX: do we need to sort these?
+ # XXX: do we need to sort these? So far we have worse problems..
for cookie in self.tor_cookie_jar:
tor_cookies += "\t"+cookie.name+":"+cookie.domain+cookie.path+" discard="+str(cookie.discard)+"\n"
for cookie in self.cookie_jar:
@@ -415,7 +428,9 @@
tor_cookies)
self.results.append(result)
self.datahandler.saveResult(result)
- return TEST_FAILURE
+ # XXX: this test is pretty spammy with false positives..
+ # It should not affect if a node "passes" or not yet.
+ #return TEST_FAILURE
return TEST_SUCCESS
@@ -811,8 +826,10 @@
if t.name == "link":
for a in t.attrs:
if a[0] == "type" and a[1] in script_mime_types:
+ plog("INFO", "Adding link script for: "+str(t))
targets.append(("js", urlparse.urljoin(orig_addr, attr_tgt)))
else:
+ plog("INFO", "Adding script tag for: "+str(t))
targets.append(("js", urlparse.urljoin(orig_addr, attr_tgt)))
elif t.name == 'a':
if attr_name == "href":
@@ -845,7 +862,7 @@
(mime_type, tor_js, tsha, orig_js, osha, new_js, nsha, exit_node) = http_ret
if mime_type not in script_mime_types:
- plog("WARN", "Non-script mime type "+mime_type+" fed to JS test")
+ plog("WARN", "Non-script mime type "+mime_type+" fed to JS test for "+address)
if mime_type in html_mime_types:
return self._check_html_worker(address, http_ret)
else:
@@ -897,7 +914,7 @@
if mime_type not in html_mime_types:
# XXX: Keep an eye on this logline.
- plog("INFO", "Non-html mime type "+mime_type+" fed to HTML test")
+ plog("INFO", "Non-html mime type "+mime_type+" fed to HTML test for "+address)
if mime_type in script_mime_types:
return self._check_js_worker(address, http_ret)
else:
@@ -968,11 +985,13 @@
new_vs_tor = SoupDiffer(new_soup, tor_soup)
# I'm an evil man and I'm going to CPU hell..
- changed_tags = old_vs_new.changed_tags_with_attrs()
- changed_tags.update(new_vs_old.changed_tags_with_attrs())
+ changed_tags = SoupDiffer.merge_tag_maps(
+ old_vs_new.changed_tags_with_attrs(),
+ new_vs_old.changed_tags_with_attrs())
- changed_attributes = old_vs_new.changed_attributes_by_tag()
- changed_attributes.update(new_vs_old.changed_attributes_by_tag())
+ changed_attributes = SoupDiffer.merge_tag_maps(
+ old_vs_new.changed_attributes_by_tag(),
+ new_vs_old.changed_attributes_by_tag())
changed_content = bool(old_vs_new.changed_content() or old_vs_new.changed_content())
@@ -1044,7 +1063,7 @@
c.set_connect_state()
try:
- c.connect((address, 443)) # XXX: Verify TorDNS here too..
+ c.connect((address, 443)) # DNS OK.
c.send(crypto.dump_certificate_request(crypto.FILETYPE_PEM,request))
except socket.error, e:
plog('WARN','An error occured while opening an ssl connection to '+address+": "+str(e))
@@ -1177,6 +1196,8 @@
if ssl_domain.seen_cert(cert_pem):
result = SSLTestResult(exit_node, address, ssl_file_name, TEST_SUCCESS)
#self.datahandler.saveResult(result)
+ if address in self.successes: self.successes[address]+=1
+ else: self.successes[address]=1
return TEST_SUCCESS
# False positive case.. Can't help it if the cert rotates AND we have a
@@ -2145,46 +2166,28 @@
tests = {}
if do_ssl:
- try:
- tests["SSL"] = SSLTest(mt, load_wordlist(ssl_wordlist_file))
- except NoURLsFound, e:
- plog('ERROR', e.message)
+ tests["SSL"] = SSLTest(mt, ssl_wordlist_file)
if do_http:
- try:
- tests["HTTP"] = HTTPTest(mt, load_wordlist(filetype_wordlist_file))
- except NoURLsFound, e:
- plog('ERROR', e.message)
+ tests["HTTP"] = HTTPTest(mt, filetype_wordlist_file)
if do_html:
- try:
- tests["HTML"] = HTMLTest(mt, load_wordlist(html_wordlist_file))
- except NoURLsFound, e:
- plog('ERROR', e.message)
+ tests["HTML"] = HTMLTest(mt, html_wordlist_file)
if do_smtp:
- try:
- tests["SMTPS"] = SMTPSTest(mt)
- except NoURLsFound, e:
- plog('ERROR', e.message)
+ tests["SMTPS"] = SMTPSTest(mt)
if do_pop:
- try:
- tests["POPS"] = POP3STest(mt)
- except NoURLsFound, e:
- plog('ERROR', e.message)
+ tests["POPS"] = POP3STest(mt)
if do_imap:
- try:
- tests["IMAPS"] = IMAPSTest(mt)
- except NoURLsFound, e:
- plog('ERROR', e.message)
+ tests["IMAPS"] = IMAPSTest(mt)
# maybe no tests could be initialized
if not (do_ssl or do_html or do_http or do_ssh or do_smtp or do_pop or do_imap):
plog('INFO', 'Done.')
sys.exit(0)
-
+
for test in tests.itervalues():
test.rewind()
@@ -2250,7 +2253,7 @@
for test in tests.itervalues():
if test.finished():
plog("NOTICE", test.proto+" test has finished all nodes. Rewinding")
- test.rewind()
+ test.rewind()
# initiate the program
Modified: torflow/trunk/NetworkScanners/soatstats.py
===================================================================
--- torflow/trunk/NetworkScanners/soatstats.py 2009-02-11 08:04:46 UTC (rev 18483)
+++ torflow/trunk/NetworkScanners/soatstats.py 2009-02-11 11:27:48 UTC (rev 18484)
@@ -99,54 +99,6 @@
if node.counts[test].inconclusive != 0:
print `node.idhex` + "\t" + `node.counts[test].inconclusive`
-
- # False positive test left in for verifcation and tweaking
- # TODO: Remove this bit eventually
- for result in data:
- if result.__class__.__name__ == "HtmlTestResult":
- if not result.tags_old or not result.tags or not result.exit_tags:
- continue
- print result.exit_node
-
- print result.tags
- print result.tags_old
- print result.exit_tags
-
- new_soup = BeautifulSoup(open(result.tags, "r").read())
- old_soup = BeautifulSoup(open(result.tags_old, "r").read())
- tor_soup = BeautifulSoup(open(result.exit_tags, "r").read())
-
- new_vs_old = SoupDiffer(new_soup, old_soup)
- old_vs_new = SoupDiffer(old_soup, new_soup)
- new_vs_tor = SoupDiffer(new_soup, tor_soup)
-
- # I'm an evil man and I'm going to CPU hell..
- changed_tags = old_vs_new.changed_tags_with_attrs()
- changed_tags.update(new_vs_old.changed_tags_with_attrs())
-
- changed_attributes = old_vs_new.changed_attributes_by_tag()
- changed_attributes.update(new_vs_old.changed_attributes_by_tag())
-
- changed_content = bool(old_vs_new.changed_content() or old_vs_new.changed_content())
-
- # Verify all of our changed tags are present here
- # XXX: Have this print out more info on changed tags..
- if new_vs_tor.has_more_changed_tags(changed_tags) or \
- new_vs_tor.has_more_changed_attrs(changed_attributes) or \
- new_vs_tor.changed_content() and not changed_content:
- false_positive = False
- else:
- false_positive = True
-
- if false_positive:
- # Use http://codespeak.net/pypy/dist/pypy/lang/js/ to parse
- # links and attributes that contain javascript
- jsdiff = JSSoupDiffer(old_soup)
- jsdiff.prune_differences(new_soup)
- false_positive = not jsdiff.contains_differences(tor_soup)
-
- print false_positive
-
print ""
if __name__ == "__main__":