[tor-commits] [torflow/master] Incorporating new container for targets. SearchBasedHTTPTest and FixedTargetHTTPTest now seem to run.
commit 0c10b5be192f96e75db45c4a32c48248da8ee513
Author: christian <christian@xxxxxxxxx>
Date: Thu Jun 2 15:05:40 2011 -0400
Incorporating new container for targets. SearchBasedHTTPTest and FixedTargetHTTPTest now seem to run.
---
NetworkScanners/ExitAuthority/soat.py | 204 +++++++++++++++-----------------
1 files changed, 96 insertions(+), 108 deletions(-)
diff --git a/NetworkScanners/ExitAuthority/soat.py b/NetworkScanners/ExitAuthority/soat.py
index c96468f..8d557eb 100755
--- a/NetworkScanners/ExitAuthority/soat.py
+++ b/NetworkScanners/ExitAuthority/soat.py
@@ -528,9 +528,10 @@ class Targets:
MUST support these methods:
add -- Add a target. Optional second argument is list of keys. Idempotent.
remove -- Remove a target. Returns True iff the target was found.
- bykey -- Get an iterator whose elements match the supplied key.
+ bykey -- Get a list whose elements match the supplied key.
__iter__
__len__
+ __getitem__
"""
def __init__(self):
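
The docstring above spells out the contract the new container must satisfy. As a rough illustration only (not the code in this commit), a minimal self-contained container meeting that contract could look like:

class TargetsSketch:
    """Illustrative sketch of the Targets contract; the real class lives in soat.py."""
    def __init__(self):
        self.pool = []     # [(target, keys)] in insertion order
        self.lookup = {}   # key -> [target, ...]

    def add(self, target, keys=()):
        """Add a target under optional keys. Idempotent."""
        if any(t == target for t, _ in self.pool):
            return
        self.pool.append((target, list(keys)))
        for k in keys:
            self.lookup.setdefault(k, []).append(target)

    def remove(self, target):
        """Remove a target. Returns True iff the target was found."""
        for i, (t, keys) in enumerate(self.pool):
            if t == target:
                del self.pool[i]
                for k in keys:
                    self.lookup[k].remove(target)
                return True
        return False

    def bykey(self, key):
        """Return the (possibly empty) list of targets filed under key."""
        return self.lookup.get(key, [])

    def keys(self):
        return self.lookup.keys()

    def __iter__(self):
        return iter([t for t, _ in self.pool])

    def __len__(self):
        return len(self.pool)

    def __getitem__(self, index):
        return self.pool[index]
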
@@ -564,15 +565,20 @@ class Targets:
return retval
def bykey(self,key):
return self.lookup.get(key,[])
+ def keys(self):
+ return self.lookup.keys()
def __iter__(self):
return map(lambda x: x[0], self.list).__iter__()
def __len__(self):
return len(self.list)
+ def __getitem__(self,index):
+ return self.list[index]
# Base Test Classes
class Test:
""" Base class for our tests """
def __init__(self, proto, port):
+ """Sets the variables that are static for the lifetime of the test and calls self._reset() which sets the variables that are not."""
self.proto = proto
self.port = port
self.min_targets = min_targets
@@ -585,8 +591,7 @@ class Test:
self.scan_nodes = 0
self.nodes_to_mark = 0
self.tests_per_node = num_tests_per_node
- self.url_reserve = {}
- self._reset()
+ self._reset() #CA make this a call to rewind instead?
self._pickle_revision = 8 # Will increment as fields are added
def run_test(self):
@@ -656,13 +661,19 @@ class Test:
return True
def add_target(self, target):
- self.targets.append(target)
+ self.targets.add(target)
+
+ def select_targets(self):
+ return self.targets
+
+ def refill_targets(self):
+ map(self.add_target, self.get_targets())
+ if not self.targets:
+ raise NoURLsFound("No URLS found for protocol "+self.proto)
def remove_target(self, target, reason="None"):
self.banned_targets.add(target)
- self.refill_targets()
- if target in self.targets:
- self.targets.remove(target)
+ self.targets.remove(target)
if target in self.dynamic_fails:
del self.dynamic_fails[target]
if target in self.successes:
@@ -693,6 +704,8 @@ class Test:
datahandler.saveResult(r)
self.results.remove(r)
+ self.refill_targets()
+
def load_rescan(self, type, since=None):
self.rescan_nodes = set([])
results = datahandler.getAll()
@@ -817,7 +830,7 @@ class Test:
def _reset(self):
self.results = []
- self.targets = []
+ self.targets = Targets()
self.tests_run = 0
self.nodes_marked = 0
self.run_start = time.time()
@@ -827,7 +840,7 @@ class Test:
self.dns_fails_per_exit = {}
self.exit_fails_per_exit = {}
self.node_results = {}
- # These are indexed by site url:
+ # These are indexed by target URI:
self.connect_fails = {}
self.timeout_fails = {}
self.dns_fails = {}
@@ -842,8 +855,8 @@ class Test:
if not self.targets:
raise NoURLsFound("No URLS found for protocol "+self.proto)
- targets = "\n\t".join(self.targets)
- plog("INFO", "Using the following urls for "+self.proto+" scan:\n\t"+targets)
+ targets_str = "\n\t".join(map(str,self.targets))
+ plog("INFO", "Using the following urls for "+self.proto+" scan:\n\t"+targets_str)
def site_tests(self, site):
tot_cnt = 0
@@ -981,17 +994,16 @@ class Test:
class BaseHTTPTest(Test):
- def __init__(self, filetypes=scan_filetypes):
+ def __init__(self, scan_filetypes=scan_filetypes):
# FIXME: Handle http urls w/ non-80 ports..
- self.scan_filetypes = filetypes
+ self.scan_filetypes = scan_filetypes
+ self.fetch_queue = []
Test.__init__(self, "HTTP", 80)
self.save_name = "HTTPTest"
- self.fetch_targets = urls_per_filetype
def _reset(self):
self.httpcode_fails = {}
self.httpcode_fails_per_exit = {}
- self.targets_by_type = {}
Test._reset(self)
def depickle_upgrade(self):
@@ -1034,15 +1046,13 @@ class BaseHTTPTest(Test):
self.tests_run += 1
- n_tests = random.choice(xrange(1,len(self.targets_by_type)+1))
- filetypes = random.sample(self.targets_by_type.keys(), n_tests)
-
- plog("INFO", "HTTPTest decided to fetch "+str(n_tests)+" urls of types: "+str(filetypes))
+ self.fetch_queue.extend(self.select_targets())
n_success = n_fail = n_inconclusive = 0
- for ftype in filetypes:
+
+ while self.fetch_queue:
+ address = self.fetch_queue.pop(0)
# FIXME: Set referrer to random or none for each of these
- address = random.choice(self.targets_by_type[ftype])
result = self.check_http(address)
if result == TEST_INCONCLUSIVE:
n_inconclusive += 1
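
The reworked run_test above drives fetches from a queue populated by select_targets(). A simplified, self-contained sketch of that control flow, with stand-in TEST_* codes and a stand-in check() callable (neither is the soat.py version):

# Stand-in result codes and checker for illustration only.
TEST_SUCCESS, TEST_FAILURE, TEST_INCONCLUSIVE = range(3)

def run_round(select_targets, check):
    """Drain a fetch queue filled from select_targets(); tally the outcomes."""
    fetch_queue = []
    fetch_queue.extend(select_targets())
    n_success = n_fail = n_inconclusive = 0
    while fetch_queue:
        address = fetch_queue.pop(0)
        result = check(address)
        if result == TEST_INCONCLUSIVE:
            n_inconclusive += 1
        elif result == TEST_FAILURE:
            n_fail += 1
        else:
            n_success += 1
    return n_success, n_fail, n_inconclusive

# e.g. run_round(lambda: ["http://example.com/a.pdf"], lambda addr: TEST_SUCCESS)
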
@@ -1062,22 +1072,9 @@ class BaseHTTPTest(Test):
else:
return TEST_SUCCESS
- def add_target(self, target):
- # HTTP Tests keep an additional dictionary of targets keyed by filetype
- split = target.rsplit('.',1)
- if len(split) > 1 and split[-1] in self.scan_filetypes:
- self.targets.append(target)
- self.targets_by_type.setdefault(split[-1], []).append(target)
-
def remove_target(self, target, reason="None"):
# Remove from targets list and targets by type dictionary
- if target in self.targets:
- self.targets.remove(target)
- for k,v in self.targets_by_type.items():
- if target in v:
- v.remove(target)
- if not v:
- del self.targets_by_type[k]
+ self.targets.remove(target)
# Delete results in httpcode_fails
if target in self.httpcode_fails:
del self.httpcode_fails[target]
@@ -1488,18 +1485,10 @@ def is_script_mimetype(mime_type):
return is_script
class BaseHTMLTest(BaseHTTPTest):
- def __init__(self, recurse_filetypes=scan_filetypes):
- BaseHTTPTest.__init__(self, recurse_filetypes)
+ def __init__(self, scan_filetypes=scan_filetypes):
+ BaseHTTPTest.__init__(self, scan_filetypes)
self.save_name = "HTMLTest"
- self.fetch_targets = num_html_urls
- self.proto = "HTML"
- self.recurse_filetypes = recurse_filetypes
- self.fetch_queue = []
-
- def _reset(self):
- self.httpcode_fails = {}
- self.httpcode_fails_per_exit = {}
- Test._reset(self)
+ self.proto = "HTML" #CA .. ?
def depickle_upgrade(self):
if self._pickle_revision < 7:
@@ -1507,11 +1496,9 @@ class BaseHTMLTest(BaseHTTPTest):
Test.depickle_upgrade(self)
def add_target(self, target):
+ """Avoid BaseHTTP.add_target which keys entries"""
Test.add_target(self, target)
- def remove_target(self, target, reason="None"):
- Test.remove_target(self, target, reason)
-
def run_test(self):
# A single test should have a single cookie jar
self.tor_cookie_jar = cookielib.MozillaCookieJar()
@@ -1616,7 +1603,7 @@ class BaseHTMLTest(BaseHTTPTest):
targets.append(("image", urlparse.urljoin(orig_addr, attr_tgt)))
elif t.name == 'a':
if attr_name == "href":
- for f in self.recurse_filetypes:
+ for f in self.scan_filetypes:
if f not in got_type and attr_tgt[-len(f):] == f:
got_type[f] = 1
targets.append(("http", urlparse.urljoin(orig_addr, attr_tgt)))
@@ -2045,14 +2032,16 @@ class FixedTargetTest:
def __init__(self, targets):
self.fixed_targets = targets
- def refill_targets(self):
- pass
-
def get_targets(self):
return self.fixed_targets[:]
+ def refill_targets(self):
+ """Can't refill FixedTargetTest"""
+ pass
+
def finished(self):
- # FixedTargetTests are done if they test all nodes or run out of targets
+ """FixedTargetTests are done if they test all nodes or run out of targets"""
+ # CA do we properly handle possibility that self.targets can run out
return not (self.nodes and self.targets)
class FixedTargetHTTPTest(FixedTargetTest, BaseHTTPTest):
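
For the fixed-target variant above, the pool is supplied up front, never refilled, and the scan ends once nodes or targets run out. A minimal, self-contained sketch of that behaviour (names are illustrative, not the soat.py classes):

class FixedPoolSketch:
    """Sketch of the fixed-target behaviour above."""
    def __init__(self, targets, nodes):
        self.fixed_targets = list(targets)
        self.targets = list(targets)   # working pool consumed during the scan
        self.nodes = list(nodes)       # exits left to test

    def get_targets(self):
        return self.fixed_targets[:]   # hand out a copy of the fixed list

    def refill_targets(self):
        pass                           # nothing to refill

    def finished(self):
        # Done once every node is tested or the target pool is exhausted.
        return not (self.nodes and self.targets)
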
@@ -2081,17 +2070,12 @@ class SearchBasedTest:
""" Mixin class. Must be mixed with a subclass of Test """
def __init__(self, wordlist_file):
self.wordlist_file = wordlist_file
-
self.host_only = False
- self.result_filetypes = ['any']
- self.result_protocol = 'any'
- self.results_per_type = 10
self.search_mode = default_search_mode
+ self.url_reserve = {}
- def refill_targets(self):
- if len(self.targets) < self.min_targets:
- plog("NOTICE", self.proto+" scanner short on targets. Adding more")
- map(self.add_target, self.get_targets())
+ def rewind(self):
+ self.wordlist = load_wordlist(self.wordlist_file)
def get_targets(self):
return self.get_search_urls()
@@ -2103,26 +2087,23 @@ class SearchBasedTest:
plog('INFO', 'Searching for relevant sites...')
urllist = set([])
- for filetype in self.result_filetypes:
- type_urls = self.get_search_urls_for_filetype(filetype)
- # make sure we don't get more urls than needed
- if len(type_urls) > self.results_per_type:
- chosen_urls = set(random.sample(type_urls, self.results_per_type))
- if filetype in self.url_reserve:
- self.url_reserve[filetype].extend(list(type_urls - chosen_urls))
- else:
- self.url_reserve[filetype] = list(type_urls - chosen_urls)
- type_urls = chosen_urls
- urllist.update(type_urls)
+ for filetype in self.scan_filetypes:
+ urllist.update(self.get_search_urls_for_filetype(filetype))
return list(urllist)
- def get_search_urls_for_filetype(self, filetype):
- type_urls = set(self.url_reserve.get(filetype, []))
- if type_urls: # Clear urls from the reserve
- self.url_reserve[filetype] = []
+ def get_search_urls_for_filetype(self, filetype,number = 0):
+ if not number:
+ number = self.results_per_type
+
+ self.url_reserve.setdefault(filetype,[])
+
+ type_urls = set(self.url_reserve[filetype][:number])
+ self.url_reserve[filetype] = self.url_reserve[filetype][number:]
+
count = 0
- while len(type_urls) < self.results_per_type and count < max_search_retry:
+
+ while len(type_urls) < number and count < max_search_retry:
count += 1
#Try to filter based on filetype/protocol. Unreliable. We will re-filter.
@@ -2194,30 +2175,34 @@ class SearchBasedTest:
if filetype == 'any':
file_list = None
else:
- file_list = self.result_filetypes
+ file_list = self.scan_filetypes
if self._is_useable_url(url, prot_list, file_list):
if self.host_only:
# FIXME: %-encoding, @'s, etc?
plog("INFO", url)
- host = urlparse.urlparse(url)[1]
+ url = urlparse.urlparse(url)[1]
# Have to check again here after parsing the url:
- if host not in self.banned_targets:
- type_urls.add(host)
- else:
- type_urls.add(url)
+ if url in self.banned_targets:
+ continue
+ type_urls.add(url)
+ plog("INFO", "Have "+str(len(type_urls))+"/"+str(number)+" urls from search so far..")
else:
pass
- plog("INFO", "Have "+str(len(type_urls))+"/"+str(self.results_per_type)+" urls from search so far..")
+
+ if len(type_urls) > number:
+ chosen = random.sample(type_urls,number)
+ self.url_reserve[filetype].extend(list(type_urls - set(chosen)))
+ type_urls = chosen
+
return type_urls
class SearchBasedHTTPTest(SearchBasedTest, BaseHTTPTest):
def __init__(self, wordlist):
BaseHTTPTest.__init__(self)
SearchBasedTest.__init__(self, wordlist)
- self.result_filetypes = self.scan_filetypes
- self.result_protocol = "http"
- self.results_per_type = self.fetch_targets
+ self.results_per_type = urls_per_filetype
+ self.result_protocol = 'http'
def depickle_upgrade(self):
if self._pickle_revision < 7:
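
get_search_urls_for_filetype now serves urls from a per-filetype reserve first, searches for the remainder, and banks any surplus back into the reserve. A simplified, self-contained sketch of that bookkeeping, with the search step faked (search(filetype) is a stand-in for the real search-engine query):

import random

def urls_for_filetype(reserve, filetype, number, search, max_retry=10):
    """Sketch: draw from the reserve, search for the rest, bank the surplus."""
    reserve.setdefault(filetype, [])
    urls = set(reserve[filetype][:number])          # serve from the reserve first
    reserve[filetype] = reserve[filetype][number:]
    count = 0
    while len(urls) < number and count < max_retry:
        count += 1
        urls.update(search(filetype))
    if len(urls) > number:                          # bank anything beyond what was asked for
        chosen = random.sample(list(urls), number)
        reserve[filetype].extend(list(urls - set(chosen)))
        urls = set(chosen)
    return urls
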
@@ -2227,28 +2212,32 @@ class SearchBasedHTTPTest(SearchBasedTest, BaseHTTPTest):
BaseHTTPTest.depickle_upgrade(self)
def rewind(self):
- self.wordlist = load_wordlist(self.wordlist_file)
+ SearchBasedTest.rewind(self)
BaseHTTPTest.rewind(self)
+ def add_target(self, target):
+ # Keys targets by filetype. One filetype per target
+ split = target.rsplit('.',1)
+ if len(split) > 1 and split[-1] in self.scan_filetypes:
+ self.targets.add(target,[split[-1]])
+ return True
+ return False
+
+ def select_targets(self):
+ retval = []
+ n_tests = random.randrange(1,len(self.targets.keys())+1)
+ filetypes = random.sample(self.targets.keys(), n_tests)
+ plog("INFO", "HTTPTest decided to fetch "+str(n_tests)+" urls of types: "+str(filetypes))
+ for ftype in filetypes:
+ retval.append(random.choice(self.targets.bykey(ftype)))
+ return retval
+
def refill_targets(self):
for ftype in self.scan_filetypes:
- if not ftype in self.targets_by_type or len(self.targets_by_type[ftype]) < self.fetch_targets:
+ targets_needed = self.results_per_type - len(self.targets.bykey(ftype))
+ if targets_needed > 0:
plog("NOTICE", self.proto+" scanner short on "+ftype+" targets. Adding more")
- map(self.add_target, self.get_search_urls_for_filetype(ftype))
-
-# This duplicated the effort of BaseHTTPTest.add_target which is invoked by
-# SearchBasedHTTPTest.rewind -> BaseHTTPTest.rewind = Test.rewind
-# Instead we should fall back on SearchBasedTest.get_targets
-# def get_targets(self):
-# raw_urls = self.get_search_urls()
-# new = {}
-# for url in raw_urls:
-# split = url.rsplit('.',1) # Try to get filetype
-# if len(split) > 1 and split[-1] in self.scan_filetypes:
-# new.setdefault(split[-1],[]).append(url)
-# for k,v in new.items():
-# self.targets_by_type.setdefault(k, []).extend(v)
-# return raw_urls
+ map(self.add_target, self.get_search_urls_for_filetype(ftype,targets_needed))
HTTPTest = SearchBasedHTTPTest # For resuming from old HTTPTest.*.test files
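
Taken together, add_target, select_targets and refill_targets above key the pool by file extension, pick a random spread of filetypes per run, and top each filetype back up to results_per_type. A compact, self-contained sketch of that behaviour, using a plain dict in place of the Targets container (scan_filetypes, results_per_type and the search callable are illustrative stand-ins):

import random

class FiletypePoolSketch:
    """Sketch of the filetype-keyed pool behaviour above."""
    def __init__(self, scan_filetypes, results_per_type, search):
        self.scan_filetypes = scan_filetypes   # e.g. ["pdf", "doc", "html"]
        self.results_per_type = results_per_type
        self.search = search                   # search(ftype, n) -> list of urls
        self.by_type = {}                      # ftype -> [url, ...]

    def add_target(self, target):
        """File a url under its extension; reject unknown filetypes."""
        split = target.rsplit('.', 1)
        if len(split) > 1 and split[-1] in self.scan_filetypes:
            self.by_type.setdefault(split[-1], []).append(target)
            return True
        return False

    def select_targets(self):
        """Pick a random number of filetypes, then one url of each."""
        if not self.by_type:
            return []
        n_tests = random.randrange(1, len(self.by_type) + 1)
        filetypes = random.sample(list(self.by_type.keys()), n_tests)
        return [random.choice(self.by_type[f]) for f in filetypes]

    def refill_targets(self):
        """Top each filetype back up to results_per_type urls."""
        for ftype in self.scan_filetypes:
            needed = self.results_per_type - len(self.by_type.get(ftype, []))
            if needed > 0:
                for url in self.search(ftype, needed):
                    self.add_target(url)
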
@@ -2257,7 +2246,6 @@ class SearchBasedHTMLTest(SearchBasedTest, BaseHTMLTest):
BaseHTMLTest.__init__(self)
SearchBasedTest.__init__(self, wordlist)
self.result_filetypes = ["any"]
- self.result_protocol = "http"
self.results_per_type = self.fetch_targets
def depickle_upgrade(self):
@@ -2268,7 +2256,7 @@ class SearchBasedHTMLTest(SearchBasedTest, BaseHTMLTest):
BaseHTMLTest.depickle_upgrade(self)
def rewind(self):
- self.wordlist = load_wordlist(self.wordlist_file)
+ SearchBasedTest.rewind(self)
BaseHTMLTest.rewind(self)
HTMLTest = SearchBasedHTMLTest # For resuming from old HTMLTest.*.test files
@@ -2908,7 +2896,7 @@ def decompress_response_data(response):
len_read = len(data)
now = time.time()
- plog("DEBUG", "Read "+str(len_read)+"/"+str(tot_len))
+ #plog("DEBUG", "Read "+str(len_read)+"/"+str(tot_len)) #Very verbose
# Wait 5 seconds before counting data
if (now-start) > 5:
rate = (float(len_read)/(now-start)) #B/s