
[or-cvs] r18262: {torflow} Add cookie support to url scraper. Fucking google captchas a (torflow/trunk/NetworkScanners)



Author: mikeperry
Date: 2009-01-23 20:07:03 -0500 (Fri, 23 Jan 2009)
New Revision: 18262

Modified:
   torflow/trunk/NetworkScanners/soat.py
Log:

Add cookie support to url scraper. Fucking google captchas
are the bane of my existence.
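The gist of the change: send a full Firefox-like header set with every
request, grab whatever cookie google hands back on the first search, and
replay it on later queries so the scraper looks like a returning browser
rather than a fresh bot. A minimal standalone sketch of that idea (the
fetch_google() helper and the trimmed header set are illustrative, not
the committed code):

import httplib
import urllib

firefox_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.8.1) Gecko/20061010 Firefox/2.0',
    'Accept-Language': 'en-us,en;q=0.5',
}

google_cookie = ""  # filled in after the first google response

def fetch_google(query):
    # Hypothetical helper: one google search with cookie replay.
    global google_cookie
    headers = dict(firefox_headers)        # don't mutate the shared dict
    if google_cookie:
        headers["Cookie"] = google_cookie  # replay the saved cookie
    conn = httplib.HTTPConnection("www.google.com")
    conn.request("GET", "/search?" + urllib.urlencode({'q': query}),
                 None, headers)
    resp = conn.getresponse()
    cookie = resp.getheader("Set-Cookie")  # servers send Set-Cookie
    if cookie:
        google_cookie = cookie.split(";")[0]  # keep only name=value
    return resp.read()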



Modified: torflow/trunk/NetworkScanners/soat.py
===================================================================
--- torflow/trunk/NetworkScanners/soat.py	2009-01-23 23:36:40 UTC (rev 18261)
+++ torflow/trunk/NetworkScanners/soat.py	2009-01-24 01:07:03 UTC (rev 18262)
@@ -38,6 +38,7 @@
 import urllib
 import urllib2
 import traceback
+import copy
 
 import libsoat 
 from libsoat import *
@@ -67,6 +68,19 @@
 allowed_filetypes = ['all','pdf']
 result_per_type = 5 
 
+firefox_headers = {
+    'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.8.1) Gecko/20061010 Firefox/2.0',
+    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+    'Accept-Language':"en-us,en;q=0.5",
+    'Accept-Encoding':"gzip,deflate",
+    'Accept-Charset': "ISO-8859-1,utf-8;q=0.7,*;q=0.7",
+    'Keep-Alive':"300",
+    'Connection':"keep-alive"
+}
+
+# This will be set the first time we hit google, if it is still empty
+google_cookie=""
+
 #
 # ports to test in the consistency test
 #
@@ -1000,11 +1014,16 @@
         self.__control.set_event_handler(self.__dnshandler)
         self.__control.set_events([TorCtl.EVENT_TYPE.STREAM], True)
 
+    def _firefoxify(self, request):
+        # XXX: Fix user agent, add cookie support
+        for h, v in firefox_headers.iteritems():
+            request.add_header(h, v)
+        
+
     def http_request(self, address):
         ''' perform a http GET-request and return the content received '''
         request = urllib2.Request(address)
-        # XXX: Make all headers match a real firefox browser
-        request.add_header('User-Agent','Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.8.1) Gecko/20061010 Firefox/2.0')
+        self._firefoxify(request)
 
         content = 0
         try:
@@ -1111,13 +1130,14 @@
             # search google for relevant pages
             # note: google only accepts requests from identified browsers
             # TODO gracefully handle the case when google doesn't want to give us results anymore
-            # XXX: Make more of these headers match? Maybe set a cookie.. or
-            # use scroogle :)
+            # XXX: Maybe set a cookie.. or use scroogle :)
             host = 'www.google.com'
             params = urllib.urlencode({'q' : query})
-            headers = {'User-Agent' : 'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.8.1) Gecko/20061010 Firefox/2.0'}
             search_path = '/search' + '?' + params
-
+            headers = copy.copy(firefox_headers)
+            global google_cookie
+            if google_cookie:
+                headers["Cookie"] = google_cookie
             connection = None
             response = None
 
@@ -1127,6 +1147,11 @@
                 response = connection.getresponse()
                 if response.status != 200:
                     raise Exception(response.status, response.reason)
+                cookie = response.getheader("Set-Cookie")
+                if cookie:
+                    plog("INFO", "Got google cookie: "+cookie)
+                    google_cookie=cookie
+                
             except socket.gaierror, e:
                 plog('ERROR', 'Scraping of http://'+host+search_path+" failed")
                 traceback.print_exc()