[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]
[or-cvs] r19240: {torflow} Change recursion behavior to work better with link tags and (torflow/trunk/NetworkScanners)
Author: mikeperry
Date: 2009-04-08 00:52:41 -0400 (Wed, 08 Apr 2009)
New Revision: 19240
Modified:
torflow/trunk/NetworkScanners/soat.py
Log:
Change recursion behavior to work better with link tags
and to avoid crawl-loops.
Modified: torflow/trunk/NetworkScanners/soat.py
===================================================================
--- torflow/trunk/NetworkScanners/soat.py 2009-04-08 04:42:48 UTC (rev 19239)
+++ torflow/trunk/NetworkScanners/soat.py 2009-04-08 04:52:41 UTC (rev 19240)
@@ -433,19 +433,23 @@
port = netloc[netloc.rfind(":")+1:]
try:
if int(port) != self.port:
+ plog("DEBUG", "Unusable port "+port+" in "+url)
return False
except:
traceback.print_exc()
plog("WARN", "Unparseable port "+port+" in "+url)
return False
if valid_schemes and scheme not in valid_schemes:
+ plog("DEBUG", "Unusable scheme "+scheme+" in "+url)
return False
if url in self.banned_targets:
+ plog("DEBUG", "Banned url "+url)
return False
if filetypes: # Must be checked last
for filetype in filetypes:
if url[-len(filetype):] == filetype:
return True
+ plog("DEBUG", "Bad filetype for "+url)
return False
return True
@@ -1001,14 +1005,19 @@
plog("INFO", "Chose random referer "+first_referer)
self.tests_run += 1
- # TODO: Watch for spider-traps! (ie mutually sourcing iframes)
+ address = random.choice(self.targets)
+
# Keep a trail log for this test and check for loops
- address = random.choice(self.targets)
+ fetched = sets.Set([])
self.fetch_queue.append(("html", address, first_referer))
n_success = n_fail = n_inconclusive = 0
while self.fetch_queue:
(test, url, referer) = self.fetch_queue.pop(0)
+ if url in fetched:
+ plog("INFO", "Already fetched "+url+", skipping")
+ continue
+ fetched.add(url)
if use_referers and referer: self.headers['Referer'] = referer
# Technically both html and js tests check and dispatch via mime types
# but I want to know when link tags lie
@@ -1047,6 +1056,7 @@
# Only pull at most one filetype from the list of 'a' links
targets = []
got_type = {}
+ found_favicon = False
# Hrmm, if we recursively strained only these tags, this might be faster
for tag in tags_to_recurse:
tags = soup.findAll(tag)
@@ -1061,11 +1071,21 @@
elif t.name in recurse_script:
if t.name == "link":
for a in t.attrs:
- #if a[0] == "type" and a[1] in script_mime_types:
- plog("INFO", "Adding link script for: "+str(t))
- targets.append(("js", urlparse.urljoin(orig_addr, attr_tgt)))
+ # Special case CSS and favicons
+ if (a[0] == "type" and a[1] == "text/css") or \
+ ((a[0] == "rel" or a[0] == "rev") and a[1] == "stylesheet"):
+ plog("INFO", "Adding CSS of: "+str(t))
+ targets.append(("http", urlparse.urljoin(orig_addr, attr_tgt)))
+ elif (a[0] == "rel" or a[0] == "rev") and \
+ ("shortcut" in a[1] or "icon" in a[1]):
+ plog("INFO", "Adding favicon of: "+str(t))
+ found_favicon = True
+ targets.append(("http", urlparse.urljoin(orig_addr, attr_tgt)))
+ elif a[0] == "type" and a[1] in script_mime_types:
+ plog("INFO", "Adding link script of: "+str(t))
+ targets.append(("js", urlparse.urljoin(orig_addr, attr_tgt)))
else:
- plog("INFO", "Adding script tag for: "+str(t))
+ plog("INFO", "Adding script tag of: "+str(t))
targets.append(("js", urlparse.urljoin(orig_addr, attr_tgt)))
elif t.name == 'a':
if attr_name == "href":
@@ -1075,7 +1095,15 @@
targets.append(("http", urlparse.urljoin(orig_addr, attr_tgt)))
else:
targets.append(("http", urlparse.urljoin(orig_addr, attr_tgt)))
- for i in sets.Set(targets):
+
+ if not found_favicon:
+ targets.insert(0, ("http", urlparse.urljoin(orig_addr, "/favicon.ico")))
+
+ loaded = sets.Set([])
+
+ for i in targets:
+ if i[1] in loaded: continue
+ loaded.add(i[1])
if self._is_useable_url(i[1], html_schemes):
plog("NOTICE", "Adding "+i[0]+" target: "+i[1])
self.fetch_queue.append((i[0], i[1], orig_addr))