[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]
[or-cvs] r18340: {torflow} We now diff Javascript in soat. (torflow/trunk/NetworkScanners)
Author: mikeperry
Date: 2009-01-30 09:21:17 -0500 (Fri, 30 Jan 2009)
New Revision: 18340
Modified:
torflow/trunk/NetworkScanners/libsoat.py
torflow/trunk/NetworkScanners/soat.py
torflow/trunk/NetworkScanners/soatstats.py
Log:
We now diff Javascript in soat.
Modified: torflow/trunk/NetworkScanners/libsoat.py
===================================================================
--- torflow/trunk/NetworkScanners/libsoat.py 2009-01-30 14:17:02 UTC (rev 18339)
+++ torflow/trunk/NetworkScanners/libsoat.py 2009-01-30 14:21:17 UTC (rev 18340)
@@ -8,27 +8,23 @@
import pickle
import sys
import time
-import difflib
+import traceback
sys.path.append("./libs")
from BeautifulSoup.BeautifulSoup import BeautifulSoup, Tag
+
import sets
from sets import Set
-#
-# Data storage
-#
+from soat_config import *
+sys.path.append("../")
+from TorCtl.TorUtil import *
-# data locations
+sys.path.append("./libs/pypy-svn/")
+import pypy.rlib.parsing.parsing
+import pypy.lang.js.jsparser
-data_dir = './data/soat/'
-ssl_certs_dir = data_dir + 'ssl/certs/'
-http_data_dir = data_dir + 'http/'
-http_content_dir = data_dir + 'http/content/'
-http_failed_dir = data_dir + 'http/failed/'
-http_inconclusive_dir = data_dir + 'http/inconclusive/'
-
# constants
TEST_SUCCESS = 0
@@ -291,6 +287,32 @@
ret.sort()
return ret
+ def changed_tags_with_attrs(self):
+ """ Map each changed tag name to the Set of ALL attribute names seen
+ on any changed instance of that tag (changed or not) """
+ changed_tags = {}
+ for tags in map(BeautifulSoup, self.changed_tags()):  # re-parse each changed tag string
+ for t in tags.findAll():
+ if t.name not in changed_tags:
+ changed_tags[t.name] = sets.Set([])
+ for attr in t.attrs:
+ changed_tags[t.name].add(attr[0])  # attr[0] is presumably the attribute name
+ return changed_tags  # NOTE(review): callers combine two of these maps with dict.update(), which replaces per-tag Sets instead of merging them
+
+ def has_more_changed_tags(self, tag_attr_map):
+ """ Returns true if any changed tag is missing from tag_attr_map,
+ or carries an attribute name not recorded for that tag in tag_attr_map
+ (as returned from changed_tags_with_attrs) """
+ for tags in map(BeautifulSoup, self.changed_tags()):
+ for t in tags.findAll():
+ if t.name not in tag_attr_map:
+ return True  # tag name never seen among the known changes
+ else:
+ for attr in t.attrs:
+ if attr[0] not in tag_attr_map[t.name]:
+ return True  # known tag, but with an attribute not seen before
+ return False
+
def _get_attributes(self):
attrs_old = [(tag.name, tag.attrs) for tag in self.soup_old.findAll()]
attrs_new = [(tag.name, tag.attrs) for tag in self.soup_new.findAll()]
@@ -311,6 +333,29 @@
ret.sort()
return ret
+ def changed_attributes_by_tag(self):
+ """ Transform the list of (tag, attribute) pairings for new/changed
+ attributes into a map of tag -> Set of attr[0] values. This allows us
+ to quickly see if any attributes changed for a specific tag. """
+ changed_attributes = {}
+ for (tag, attr) in self.changed_attributes():
+ if tag not in changed_attributes:
+ changed_attributes[tag] = sets.Set([])
+ changed_attributes[tag].add(attr[0])  # attr[0] is presumably the attribute name
+ return changed_attributes  # NOTE(review): callers combine two of these maps with dict.update(), which replaces per-tag Sets instead of merging them
+
+ def has_more_changed_attrs(self, attrs_by_tag):
+ """ Returns true if any changed (tag, attribute) pair has a tag that is
+ missing from attrs_by_tag, or an attribute not recorded for that tag
+ (attrs_by_tag as returned from changed_attributes_by_tag) """
+ for (tag, attr) in self.changed_attributes():
+ if tag in attrs_by_tag:
+ if attr[0] not in attrs_by_tag[tag]:
+ return True  # known tag, attribute not seen before
+ else:
+ return True  # tag has no recorded attribute changes at all
+ return False
+
def changed_content(self):
""" Return a list of tag contents changed in soup_new """
tags_old = sets.Set(map(str,
@@ -321,29 +366,6 @@
ret.sort()
return ret
- def diff_tags(self):
- tags_old = map(str, [tag for tag in self.soup_old.findAll() if isinstance(tag, Tag)])
- tags_new = map(str, [tag for tag in self.soup_new.findAll() if isinstance(tag, Tag)])
- tags_old.sort()
- tags_new.sort()
- diff = difflib.SequenceMatcher(None, tags_old, tags_new)
- return diff
-
- def diff_attributes(self):
- (attr_old, attr_new) = self._get_attributes()
- attr_old.sort()
- attr_new.sort()
- diff = difflib.SequenceMatcher(None, attr_old, attr_new)
- return diff
-
- def diff_content(self):
- tags_old = sets.Set(map(str,
- [tag for tag in self.soup_old.findAll() if not isinstance(tag, Tag)]))
- tags_new = sets.Set(map(str,
- [tag for tag in self.soup_new.findAll() if not isinstance(tag, Tag)]))
- diff = difflib.SequenceMatcher(None, tags_old, tags_new)
- return diff
-
def __str__(self):
tags = self.changed_tags()
out = "Tags:\n"+"\n".join(tags)
@@ -359,4 +381,109 @@
f = open(outfile, "w")
f.write(str(self))
f.close()
+
+
+class JSDiffer:  # compares Javascript strings by their per-symbol AST node counts
+ def __init__(self, js_string):
+ self.ast_cnts = self.count_ast_elements(js_string)  # baseline counts that later strings are compared against
+
+ def _ast_recursive_worker(ast, ast_cnts):  # tallies ast.symbol for this node and, for Nonterminals, every descendant
+ if not ast.symbol in ast_cnts:
+ ast_cnts[ast.symbol] = 1
+ else: ast_cnts[ast.symbol] += 1
+ if isinstance(ast, pypy.rlib.parsing.tree.Nonterminal):  # NOTE(review): pypy.rlib.parsing.tree is not imported explicitly in this diff - confirm it is loaded transitively
+ for child in ast.children:
+ JSDiffer._ast_recursive_worker(child, ast_cnts)
+ _ast_recursive_worker = Callable(_ast_recursive_worker)  # Callable is defined outside this diff; presumably makes this callable without an instance
+ def count_ast_elements(self, js_string, name="global"):  # returns a dict of symbol -> count; parse failures become a "ParseError:<name>" entry
+ ast_cnts = {}
+ try:
+ ast = pypy.lang.js.jsparser.parse(js_string)
+ JSDiffer._ast_recursive_worker(ast, ast_cnts)
+ except (pypy.rlib.parsing.deterministic.LexerError, UnicodeDecodeError, pypy.rlib.parsing.parsing.ParseError), e:
+ # Store info about the name and type of parse error
+ # so we can match that up too.
+ name+=":"+e.__class__.__name__
+ if "source_pos" in e.__dict__:  # append the position only when the exception instance carries one
+ name+=":"+str(e.source_pos)
+ plog("INFO", "Parse error "+name+" on "+js_string)
+ if not "ParseError:"+name in ast_cnts:
+ ast_cnts["ParseError:"+name] = 1
+ else: ast_cnts["ParseError:"+name] +=1
+ return ast_cnts
+
+ def _difference_pruner(self, other_cnts):  # zeroes every baseline count that differs from, or is absent in, other_cnts
+ for node in self.ast_cnts.iterkeys():
+ if node not in other_cnts:
+ self.ast_cnts[node] = 0  # a count of 0 marks the symbol as pruned
+ elif self.ast_cnts[node] != other_cnts[node]:
+ self.ast_cnts[node] = 0
+ for node in other_cnts.iterkeys():
+ if node not in self.ast_cnts:
+ self.ast_cnts[node] = 0  # symbols seen only in other_cnts are recorded as pruned too
+
+ def _difference_checker(self, other_cnts):  # True if any non-pruned count differs or other_cnts has an unknown symbol
+ for node in self.ast_cnts.iterkeys():
+ if not self.ast_cnts[node]: continue # pruned difference
+ if node not in other_cnts:
+ return True
+ elif self.ast_cnts[node] != other_cnts[node]:
+ return True
+ for node in other_cnts.iterkeys():
+ if node not in self.ast_cnts:
+ return True  # symbol never seen in the baseline, not even as a pruned entry
+ return False
+
+ def prune_differences(self, other_string):  # mutates self.ast_cnts so differences against other_string are ignored later
+ other_cnts = self.count_ast_elements(other_string)
+ self._difference_pruner(other_cnts)
+
+ def contains_differences(self, other_string):  # True if other_string differs from the (pruned) baseline counts
+ other_cnts = self.count_ast_elements(other_string)
+ return self._difference_checker(other_cnts)
+
+class JSSoupDiffer(JSDiffer):  # JSDiffer over a whole soup: sums AST counts from script tag bodies and script-bearing attributes
+ def _add_cnts(tag_cnts, ast_cnts):  # returns a new dict holding the per-key sum of both count maps
+ ret_cnts = {}
+ for n in tag_cnts.iterkeys():
+ if n in ast_cnts:
+ ret_cnts[n] = tag_cnts[n]+ast_cnts[n]
+ else:
+ ret_cnts[n] = tag_cnts[n]
+ for n in ast_cnts.iterkeys():
+ if n not in tag_cnts:
+ ret_cnts[n] = ast_cnts[n]
+ return ret_cnts
+ _add_cnts = Callable(_add_cnts)  # Callable is defined outside this diff; presumably makes this callable without an instance
+
+ def count_ast_elements(self, soup, name="Soup"):  # overrides the base method: takes a soup instead of a Javascript string
+ ast_cnts = {}
+ for tag in soup.findAll():
+ if tag.name == 'script':
+ for child in tag.childGenerator():
+ if isinstance(child, Tag):
+ plog("ERROR", "Script tag with subtag!")
+ else:
+ tag_cnts = JSDiffer.count_ast_elements(self, str(child), tag.name)  # string-based parse from the base class
+ ast_cnts = JSSoupDiffer._add_cnts(tag_cnts, ast_cnts)
+ for attr in tag.attrs:
+ # hrmm.. %-encoding too? Firefox negs on it..
+ parse = ""
+ if attr[1].replace(" ","")[:11] == "javascript:":  # value starts with "javascript:" once spaces are removed
+ split_at = attr[1].find(":")+1
+ parse = str(attr[1][split_at:])  # everything after the first ':'
+ elif attr[0] in attrs_with_raw_script_map:  # attrs_with_raw_script_map is defined outside this diff (presumably soat_config)
+ parse = str(attr[1])
+ if not parse: continue
+ tag_cnts = JSDiffer.count_ast_elements(self,parse,tag.name+":"+attr[0])
+ ast_cnts = JSSoupDiffer._add_cnts(tag_cnts, ast_cnts)
+ return ast_cnts
+
+ def prune_differences(self, other_soup):  # same body as JSDiffer's; self.count_ast_elements resolves to the soup version above
+ other_cnts = self.count_ast_elements(other_soup)
+ self._difference_pruner(other_cnts)
+
+ def contains_differences(self, other_soup):  # same body as JSDiffer's, operating on a soup
+ other_cnts = self.count_ast_elements(other_soup)
+ return self._difference_checker(other_cnts)
Modified: torflow/trunk/NetworkScanners/soat.py
===================================================================
--- torflow/trunk/NetworkScanners/soat.py 2009-01-30 14:17:02 UTC (rev 18339)
+++ torflow/trunk/NetworkScanners/soat.py 2009-01-30 14:21:17 UTC (rev 18340)
@@ -43,7 +43,6 @@
import cookielib
import sha
import Queue
-import difflib
from libsoat import *
@@ -64,162 +63,11 @@
from SocksiPy import socks
import Pyssh.pyssh
-#
-# config stuff
-#
+from soat_config import *
-# these are used when searching for 'random' urls for testing
-wordlist_file = './wordlist.txt';
-# Hrmm.. Too many of these and Google really h8s us..
-scan_filetypes = ['exe','pdf','doc','msi']#,'rpm','dmg','pkg','dpkg']
-
-# Avoid vmware images+isos plz. Nobody could possibly have the patience
-# to download anything much larger than 30MB over Tor anyways ;)
-# XXX: 30MB?? Who the hell am I kidding. For testing this needs to be like 1MB
-max_content_size = 1024*1024 # 30*1024*1024
-
-# Kill fetches if they drop below 1kbyte/sec
-min_rate=1024
-
-
-firefox_headers = {
- 'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5',
- 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
- 'Accept-Language':"en-us,en;q=0.5",
- 'Accept-Encoding':"gzip,deflate",
- 'Accept-Charset': "ISO-8859-1,utf-8;q=0.7,*;q=0.7",
- 'Keep-Alive':"300",
- 'Connection':"keep-alive"
-}
-
-# http://www.voidspace.org.uk/python/articles/cookielib.shtml
-search_cookie_file="search_cookies.lwp"
search_cookies=None
-yahoo_search_mode = {"host" : "search.yahoo.com", "query":"p", "filetype": "originurlextension:", "inurl":None, "class":"yschttl", "useragent":False}
-google_search_mode = {"host" : "www.google.com", "query":"q", "filetype":"filetype:", "inurl":"inurl:", "class" : "l", "useragent":True}
-
-# FIXME: This does not affect the ssl search.. no other search engines have
-# a working "inurl:" that allows you to pick the scheme to be https like google...
-default_search_mode = google_search_mode
-
-# ports to test in the consistency test
-
-ports_to_check = [
- ["pop", ExitPolicyRestriction('255.255.255.255', 110), "pops", ExitPolicyRestriction('255.255.255.255', 995)],
- ["imap", ExitPolicyRestriction('255.255.255.255', 143), "imaps", ExitPolicyRestriction('255.255.255.255', 993)],
- ["telnet", ExitPolicyRestriction('255.255.255.255', 23), "ssh", ExitPolicyRestriction('255.255.255.255', 22)],
- ["smtp", ExitPolicyRestriction('255.255.255.255', 25), "smtps", ExitPolicyRestriction('255.255.255.255', 465)],
- ["http", ExitPolicyRestriction('255.255.255.255', 80), "https",
-ExitPolicyRestriction('255.255.255.255', 443)],
- ["email", NodeRestrictionList([
-ExitPolicyRestriction('255.255.255.255',110),
-ExitPolicyRestriction('255.255.255.255',143)
-]),
-"secure email",
-OrNodeRestriction([
-ExitPolicyRestriction('255.255.255.255',995),
-ExitPolicyRestriction('255.255.255.255',993),
-ExitPolicyRestriction('255.255.255.255',465),
-ExitPolicyRestriction('255.255.255.255',587)
-])],
- ["plaintext", AtLeastNNodeRestriction([
-ExitPolicyRestriction('255.255.255.255',110),
-ExitPolicyRestriction('255.255.255.255',143),
-ExitPolicyRestriction('255.255.255.255',23),
-ExitPolicyRestriction('255.255.255.255',21),
-ExitPolicyRestriction('255.255.255.255',80)
-#ExitPolicyRestriction('255.255.255.255',25),
-], 4),
-"secure",
-OrNodeRestriction([
-ExitPolicyRestriction('255.255.255.255',995),
-ExitPolicyRestriction('255.255.255.255',993),
-ExitPolicyRestriction('255.255.255.255',22),
-ExitPolicyRestriction('255.255.255.255',465),
-ExitPolicyRestriction('255.255.255.255',587),
-ExitPolicyRestriction('255.255.255.255',443)
-])]
-]
-
#
-# non-public IPv4 address ranges network portions
-# refer to: www.iana.org/assignments/ipv4-address-space, www.iana.org/assignments/multicast-addresses
-#
-ipv4_nonpublic = [
- '00000000', # default route and its network: 0.0.0.0/8
- '00001010', # private 10.0.0.0/8
- '01111111', # loopback 127.0.0.0/8
- '1010100111111110', # link-local 169.254.0.0/16
- '101011000001', # private 172.16.0.0/12
- '1100000010101000', # private 192.168.0.0/16
- '111' # multicast & experimental 224.0.0.0/3
-]
-
-# Tags and attributes to check in the http test.
-# The general idea is to grab tags with attributes known
-# to either hold script, or cause automatic network actvitity
-# Note: the more we add, the greater the potential for false positives...
-# We also only care about the ones that work for FF2/FF3.
-
-# TODO: If we cut down on these tags, we can cut down on false
-# positives. The ultimate acid test would be to have two different Google
-# queries come back with the same tag structure after filtering them.
-# Unfortunately, Google munges its javascript, so we have to do
-# some more advanced processing to reach that goal..
-# Also, I'm somewhat torn on dropping 'a' tags..
-tags_to_check = ['a', 'applet', 'area', 'base', 'embed', 'form',
- 'frame', 'iframe', 'img', 'input', 'link', 'meta',
- 'object', 'script', 'style', 'layer', 'ilayer']
-tags_preserve_inner = ['script','style']
-
-# Merged from:
-# http://www.w3.org/TR/REC-html40/index/attributes.html
-# http://www.w3.org/TR/REC-html40/index/elements.html
-# http://web.archive.org/web/20060113072810/www.mozilla.org/docs/dom/domref/dom_event_ref33.html
-# http://scrivna.com/blog/2008/09/18/php-xss-filtering-function/
-# https://svn.typo3.org/TYPO3v4/Core/trunk/typo3/contrib/RemoveXSS/RemoveXSS.php
-# http://www.expertzzz.com/Downloadz/view/3424
-# http://kallahar.com/smallprojects/php_xss_filter_function.php
-# and http://ha.ckers.org/xss.html
-attrs_to_check = ['background', 'cite', 'classid', 'codebase', 'data',
-'longdesc', 'onabort', 'onactivate', 'onafterprint', 'onafterupdate',
-'onattrmodified', 'onbeforeactivate', 'onbeforecopy', 'onbeforecut',
-'onbeforedeactivate', 'onbeforeeditfocus', 'onbeforepaste', 'onbeforeprint',
-'onbeforeunload', 'onbeforeupdate', 'onblur', 'onbounce', 'onbroadcast',
-'oncellchange', 'onchange', 'oncharacterdatamodified', 'onclick', 'onclose',
-'oncommand', 'oncommandupdate', 'oncontextmenu', 'oncontrolselect', 'oncopy',
-'oncut', 'ondataavaible', 'ondataavailable', 'ondatasetchanged',
-'ondatasetcomplete', 'ondblclick', 'ondeactivate', 'ondrag', 'ondragdrop',
-'ondragend', 'ondragenter', 'ondragexit', 'ondraggesture', 'ondragleave',
-'ondragover', 'ondragstart', 'ondrop', 'onerror', 'onerrorupdate',
-'onfilterchange', 'onfilterupdate', 'onfinish', 'onfocus', 'onfocusin',
-'onfocusout', 'onhelp', 'oninput', 'onkeydown', 'onkeypress', 'onkeyup',
-'onlayoutcomplete', 'onload', 'onlosecapture', 'onmousedown', 'onmouseenter',
-'onmouseleave', 'onmousemove', 'onmouseout', 'onmouseover', 'onmouseup',
-'onmousewheel', 'onmove', 'onmoveend', 'onmoveout', 'onmovestart',
-'onnodeinserted', 'onnodeinsertedintodocument', 'onnoderemoved',
-'onnoderemovedfromdocument', 'onoverflowchanged', 'onpaint', 'onpaste',
-'onpopupHidden', 'onpopupHiding', 'onpopupShowing', 'onpopupShown',
-'onpropertychange', 'onreadystatechange', 'onreset', 'onresize',
-'onresizeend', 'onresizestart', 'onrowenter', 'onrowexit', 'onrowsdelete',
-'onrowsinserted', 'onscroll', 'onselect', 'onselectionchange',
-'onselectstart', 'onstart', 'onstop', 'onsubmit', 'onsubtreemodified',
-'ontext', 'onunderflow', 'onunload', 'overflow', 'profile', 'src', 'style',
-'usemap']
-attrs_to_check_map = {}
-for __a in attrs_to_check: attrs_to_check_map[__a]=1
-attrs_to_prune = ['alt', 'label', 'prompt' 'standby', 'summary', 'title',
- 'abbr']
-
-# For recursive fetching of urls:
-tags_to_recurse = ['a', 'applet', 'embed', 'frame', 'iframe', #'img',
- 'link', 'object', 'script', 'layer', 'ilayer']
-recurse_html = ['frame', 'iframe', 'layer', 'ilayer']
-attrs_to_recurse = ['background', 'codebase', 'data', 'href',
- 'pluginurl', 'src']
-
-#
# constants
#
@@ -317,6 +165,9 @@
plog("INFO", "Using the following urls for "+self.proto+" scan:\n\t"+targets)
self.tests_run = 0
self.nodes_marked = 0
+ # XXX: We really need to register an eventhandler
+ # and register a callback for it when this list
+ # changes due to dropping either "Running" or "Fast"
self.nodes = self.mt.get_nodes_for_port(self.port)
self.node_map = {}
for n in self.nodes:
@@ -752,6 +603,7 @@
for a in t.attrs:
attr_name = str(a[0])
attr_tgt = str(a[1])
+ # TODO: Split off javascript
if attr_name in attrs_to_recurse:
if str(t.name) in recurse_html:
plog("NOTICE", "Adding html "+str(t.name)+" target: "+attr_tgt)
@@ -803,6 +655,7 @@
return soup
def check_html(self, address):
+ # XXX: Check mimetype to decide what to do..
''' check whether a http connection to a given address is molested '''
plog('INFO', 'Conducting an html test with destination ' + address)
@@ -976,59 +829,29 @@
new_vs_old = SoupDiffer(soup_new, soup)
new_vs_tor = SoupDiffer(soup_new, psoup)
- # TODO: Consider storing these changing attributes
- # for more than just this run..
- # FIXME: Also consider refactoring this into SoupDiffer.
- # It's kind of a mess..
- changed_tags = {}
- changed_attributes = {}
# I'm an evil man and I'm going to CPU hell..
- for tags in map(BeautifulSoup, old_vs_new.changed_tags()):
- for t in tags.findAll():
- if t.name not in changed_tags:
- changed_tags[t.name] = sets.Set([])
- for attr in t.attrs:
- changed_tags[t.name].add(attr[0])
- for tags in map(BeautifulSoup, new_vs_old.changed_tags()):
- for t in tags.findAll():
- if t.name not in changed_tags:
- changed_tags[t.name] = sets.Set([])
- for attr in t.attrs:
- changed_tags[t.name].add(attr[0])
- for (tag, attr) in old_vs_new.changed_attributes():
- if tag not in changed_attributes:
- changed_attributes[tag] = {}
- changed_attributes[tag][attr[0]] = 1
- for (tag, attr) in new_vs_old.changed_attributes():
- changed_attributes[attr[0]] = 1
- if tag not in changed_attributes:
- changed_attributes[tag] = {}
- changed_attributes[tag][attr[0]] = 1
-
- changed_content = bool(old_vs_new.changed_content() or old_vs_new.changed_content())
+ changed_tags = old_vs_new.changed_tags_with_attrs()
+ changed_tags.update(new_vs_old.changed_tags_with_attrs())
- false_positive = True
- for tags in map(BeautifulSoup, new_vs_tor.changed_tags()):
- for t in tags.findAll():
- if t.name not in changed_tags:
- false_positive = False
- else:
- for attr in t.attrs:
- if attr[0] not in changed_tags[t.name]:
- false_positive = False
- for (tag, attr) in new_vs_tor.changed_attributes():
- if tag in changed_attributes:
- if attr[0] not in changed_attributes[tag]:
- false_positive=False
- else:
- if not false_positive:
- plog("ERROR", "False positive contradiction at "+exit_node+" for "+address)
- false_positive = False
+ changed_attributes = old_vs_new.changed_attributes_by_tag()
+ changed_attributes.update(new_vs_old.changed_attributes_by_tag())
- if new_vs_tor.changed_content() and not changed_content:
+ changed_content = bool(old_vs_new.changed_content() or new_vs_old.changed_content())  # check both diff directions, not old_vs_new twice
+
+ # Verify all of our changed tags are present here
+ if new_vs_tor.has_more_changed_tags(changed_tags) or \
+ new_vs_tor.has_more_changed_attrs(changed_attributes) or \
+ new_vs_tor.changed_content() and not changed_content:
false_positive = False
+ else:
+ false_positive = True
if false_positive:
+ jsdiff = JSSoupDiffer(soup)
+ jsdiff.prune_differences(soup_new)
+ false_positive = not jsdiff.contains_differences(psoup)
+
+ if false_positive:
plog("NOTICE", "False positive detected for dynamic change at "+address+" via "+exit_node)
result = HtmlTestResult(exit_node, address, TEST_SUCCESS)
self.results.append(result)
Modified: torflow/trunk/NetworkScanners/soatstats.py
===================================================================
--- torflow/trunk/NetworkScanners/soatstats.py 2009-01-30 14:17:02 UTC (rev 18339)
+++ torflow/trunk/NetworkScanners/soatstats.py 2009-01-30 14:21:17 UTC (rev 18340)
@@ -18,44 +18,6 @@
sys.path.append("../")
from TorCtl.TorUtil import *
-sys.path.append("./libs/pypy-svn/")
-import pypy.rlib.parsing.parsing
-import pypy.lang.js.jsparser
-
-attrs_with_raw_script = [
-'onabort', 'onactivate', 'onafterprint', 'onafterupdate',
-'onattrmodified', 'onbeforeactivate', 'onbeforecopy', 'onbeforecut',
-'onbeforedeactivate', 'onbeforeeditfocus', 'onbeforepaste', 'onbeforeprint',
-'onbeforeunload', 'onbeforeupdate', 'onblur', 'onbounce', 'onbroadcast',
-'oncellchange', 'onchange', 'oncharacterdatamodified', 'onclick', 'onclose',
-'oncommand', 'oncommandupdate', 'oncontextmenu', 'oncontrolselect', 'oncopy',
-'oncut', 'ondataavaible', 'ondataavailable', 'ondatasetchanged',
-'ondatasetcomplete', 'ondblclick', 'ondeactivate', 'ondrag', 'ondragdrop',
-'ondragend', 'ondragenter', 'ondragexit', 'ondraggesture', 'ondragleave',
-'ondragover', 'ondragstart', 'ondrop', 'onerror', 'onerrorupdate',
-'onfilterchange', 'onfilterupdate', 'onfinish', 'onfocus', 'onfocusin',
-'onfocusout', 'onhelp', 'oninput', 'onkeydown', 'onkeypress', 'onkeyup',
-'onlayoutcomplete', 'onload', 'onlosecapture', 'onmousedown', 'onmouseenter',
-'onmouseleave', 'onmousemove', 'onmouseout', 'onmouseover', 'onmouseup',
-'onmousewheel', 'onmove', 'onmoveend', 'onmoveout', 'onmovestart',
-'onnodeinserted', 'onnodeinsertedintodocument', 'onnoderemoved',
-'onnoderemovedfromdocument', 'onoverflowchanged', 'onpaint', 'onpaste',
-'onpopupHidden', 'onpopupHiding', 'onpopupShowing', 'onpopupShown',
-'onpropertychange', 'onreadystatechange', 'onreset', 'onresize',
-'onresizeend', 'onresizestart', 'onrowenter', 'onrowexit', 'onrowsdelete',
-'onrowsinserted', 'onscroll', 'onselect', 'onselectionchange',
-'onselectstart', 'onstart', 'onstop', 'onsubmit', 'onsubtreemodified',
-'ontext', 'onunderflow', 'onunload'
-]
-attrs_to_check = ['background', 'cite', 'classid', 'codebase', 'data',
-'longdesc', 'profile', 'src', 'style', 'usemap']
-attrs_to_check.extend(attrs_with_raw_script)
-attrs_to_check_map = {}
-for __a in attrs_to_check: attrs_to_check_map[__a]=1
-attrs_with_raw_script_map = {}
-for __a in attrs_with_raw_script: attrs_with_raw_script_map[__a]=1
-
-
class ResultCount:
def __init__(self, type):
self.type = type
@@ -158,148 +120,30 @@
old_vs_new = SoupDiffer(old_soup, new_soup)
new_vs_tor = SoupDiffer(new_soup, tor_soup)
- changed_tags = {}
- changed_attributes = {}
# I'm an evil man and I'm going to CPU hell..
- for tags in map(BeautifulSoup, old_vs_new.changed_tags()):
- for t in tags.findAll():
- if t.name not in changed_tags:
- changed_tags[t.name] = sets.Set([])
- for attr in t.attrs:
- changed_tags[t.name].add(attr[0])
- for tags in map(BeautifulSoup, new_vs_old.changed_tags()):
- for t in tags.findAll():
- if t.name not in changed_tags:
- changed_tags[t.name] = sets.Set([])
- for attr in t.attrs:
- changed_tags[t.name].add(attr[0])
- for (tag, attr) in old_vs_new.changed_attributes():
- if tag not in changed_attributes:
- changed_attributes[tag] = {}
- changed_attributes[tag][attr[0]] = 1
- for (tag, attr) in new_vs_old.changed_attributes():
- changed_attributes[attr[0]] = 1
- if tag not in changed_attributes:
- changed_attributes[tag] = {}
- changed_attributes[tag][attr[0]] = 1
-
+ changed_tags = old_vs_new.changed_tags_with_attrs()
+ changed_tags.update(new_vs_old.changed_tags_with_attrs())
+
+ changed_attributes = old_vs_new.changed_attributes_by_tag()
+ changed_attributes.update(new_vs_old.changed_attributes_by_tag())
+
changed_content = bool(old_vs_new.changed_content() or old_vs_new.changed_content())
-
- false_positive = True
- for tags in map(BeautifulSoup, new_vs_tor.changed_tags()):
- for t in tags.findAll():
- if t.name not in changed_tags:
- false_positive = False
- else:
- for attr in t.attrs:
- if attr[0] not in changed_tags[t.name]:
- false_positive = False
- for (tag, attr) in new_vs_tor.changed_attributes():
- if tag in changed_attributes:
- if attr[0] not in changed_attributes[tag]:
- false_positive=False
- else:
- if not false_positive:
- plog("ERROR", "False positive contradiction at "+exit_node+" for "+address)
- false_positive = False
-
- if new_vs_tor.changed_content() and not changed_content:
+
+ # Verify all of our changed tags are present here
+ if new_vs_tor.has_more_changed_tags(changed_tags) or \
+ new_vs_tor.has_more_changed_attrs(changed_attributes) or \
+ new_vs_tor.changed_content() and not changed_content:
false_positive = False
-
- def ast_recurse(ast, map):
- if not ast.symbol in map:
- map[ast.symbol] = 1
- else: map[ast.symbol] += 1
- if isinstance(ast, pypy.rlib.parsing.tree.Nonterminal):
- for child in ast.children:
- ast_recurse(child, map)
-
- def count_ast(map, tags):
- for tag_l in tags:
- for tag in tag_l.findAll():
- did_parse = False
- if tag.name == 'script':
- for child in tag.childGenerator():
- if isinstance(child, Tag):
- plog("ERROR", "Script tag with subtag!")
- else:
- try:
- did_parse = True
- ast = pypy.lang.js.jsparser.parse(str(child))
- ast_recurse(ast, map)
- except (pypy.rlib.parsing.deterministic.LexerError, UnicodeDecodeError, pypy.rlib.parsing.parsing.ParseError):
- plog("NOTICE", "Parse error on "+str(child))
- if not "ParseError"+tag.name in map:
- map["ParseError"+tag.name] = 1
- else: map["ParseError"+tag.name] +=1
-
- for attr in tag.attrs:
- # XXX: %-encoding too
- parse = ""
- if attr[1].replace(" ","")[:11] == "javascript:":
- split_at = attr[1].find(":")+1
- parse = str(attr[1][split_at:])
- elif attr[0] in attrs_with_raw_script_map:
- parse = str(attr[1])
- if not parse: continue
- try:
- did_parse = True
- ast = pypy.lang.js.jsparser.parse(parse)
- ast_recurse(ast, map)
- except (pypy.rlib.parsing.deterministic.LexerError, UnicodeDecodeError, pypy.rlib.parsing.parsing.ParseError):
- plog("NOTICE", "Parse error on "+parse+" in "+attr[0]+"="+attr[1])
- if not "ParseError"+tag.name+attr[0] in map:
- map["ParseError"+tag.name+attr[0]] = 1
- else: map["ParseError"+attr[0]] +=1
+ else:
+ false_positive = True
if false_positive:
# Use http://codespeak.net/pypy/dist/pypy/lang/js/ to parse
# links and attributes that contain javascript
+ jsdiff = JSSoupDiffer(old_soup)
+ jsdiff.prune_differences(new_soup)
+ false_positive = not jsdiff.contains_differences(tor_soup)
- old_vs_new_cnt = {}
- count_ast(old_vs_new_cnt, [old_soup])
-
- new_vs_old_cnt = {}
- count_ast(new_vs_old_cnt, [new_soup])
-
- # for each changed tag, count all tree elements in a hash table.
- # Then, compare the counts between the two fetches
- # If any count changes, mark its count as -1
- # Make sure the terminal counts of the tor fetch match
- # except for the -1 terminals
-
- for node in old_vs_new_cnt.iterkeys():
- if node not in new_vs_old_cnt:
- plog("INFO", "Javascript AST element "+node+" absent..")
- new_vs_old_cnt[node] = 0
- elif new_vs_old_cnt[node] != old_vs_new_cnt[node]:
- plog("INFO", "Javascript AST count differs for "+node+": "+str(new_vs_old_cnt[node])+" vs "+str(old_vs_new_cnt[node]))
- new_vs_old_cnt[node] = 0
-
- for node in new_vs_old_cnt.iterkeys():
- if node not in old_vs_new_cnt:
- plog("INFO", "Javascript AST element "+node+" absent..")
- new_vs_old_cnt[node] = 0
-
- new_vs_tor_cnt = {}
- count_ast(new_vs_tor_cnt, [tor_soup])
-
- for node in new_vs_old_cnt.iterkeys():
- if not new_vs_old_cnt[node]:
- continue
- if node not in new_vs_tor_cnt:
- plog("ERROR", "Javascript AST element "+node+" absent from Tor.")
- false_positive = False
- elif new_vs_old_cnt[node] != new_vs_tor_cnt[node]:
- plog("ERROR", "Javascript AST count differs for "+node+": "+str(new_vs_old_cnt[node])+" vs "+str(new_vs_tor_cnt[node]))
- false_positive = False
-
- for node in new_vs_tor_cnt.iterkeys():
- if node not in new_vs_old_cnt:
- plog("ERROR", "Javascript AST element "+node+" present only in Tor")
- false_positive = False
-
-
print false_positive
print ""