[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]

[tor-commits] [ooni-probe/master] Make improvements to the classifier



commit 994860da43b2488de8a64d449ec2b01addb1ea4f
Author: Arturo Filastò <hellais@xxxxxxxxxxxxxx>
Date:   Sun Aug 19 15:52:46 2012 -0700

    Make improvements to the classifier
---
 ooni/plugins/domclass.py |   54 ++++++++++++++++++++++++++++-----------------
 1 file changed, 33 insertions(+), 21 deletions(-)

diff --git a/ooni/plugins/domclass.py b/ooni/plugins/domclass.py
index 5c9b6fb..31e2e41 100644
--- a/ooni/plugins/domclass.py
+++ b/ooni/plugins/domclass.py
@@ -17,6 +17,7 @@ class domclassArgs(usage.Options):
                      ['fileb', 'b', None, 'Corpus file'],
                      ['asset', 'a', None, 'URL List'],
                      ['resume', 'r', 0, 'Resume at this index']]
+
 alltags = ['A', 'ABBR', 'ACRONYM', 'ADDRESS', 'APPLET', 'AREA', 'B', 'BASE',
            'BASEFONT', 'BD', 'BIG', 'BLOCKQUOTE', 'BODY', 'BR', 'BUTTON', 'CAPTION',
            'CENTER', 'CITE', 'CODE', 'COL', 'COLGROUP', 'DD', 'DEL', 'DFN', 'DIR', 'DIV',
@@ -38,12 +39,10 @@ commontags = ['A', 'B', 'BLOCKQUOTE', 'BODY', 'BR', 'BUTTON', 'CAPTION',
            'STRIKE', 'STRONG', 'STYLE', 'SUB', 'SUP', 'TABLE', 'TBODY', 'TD',
            'TEXTAREA', 'TFOOT', 'TH', 'THEAD', 'TITLE', 'TR', 'TT', 'U', 'UL']
 
-thetags = ['A',
-           'DIV',
-           'FRAME', 'H1', 'H2',
-           'H3', 'H4', 'IFRAME ', 'INPUT', 'LABEL','LI', 'P', 'SCRIPT', 'SPAN',
-           'STYLE',
-           'TR']
+thetags = ['A', 'DIV', 'FRAME', 'H1', 'H2',
+           'H3', 'H4', 'IFRAME ', 'INPUT',
+           'LABEL','LI', 'P', 'SCRIPT', 'SPAN',
+           'STYLE', 'TR']
 
 def compute_matrix(dataset):
     import itertools
@@ -65,15 +64,26 @@ def compute_matrix(dataset):
             y = len(thetags)
 
         matrix[x,y] += 1
+
+    for x in xrange(len(thetags) + 1):
+        possibilities = 0
+        for y in matrix[x]:
+            possibilities += y
+
+        for i in xrange(len(matrix[x])):
+            if possibilities != 0:
+                matrix[x][i] = matrix[x][i]/possibilities
+
     ret['matrix'] = matrix
     ret['eigen'] = numpy.linalg.eigvals(matrix)
     return ret
 
-def readDOM(fn):
+def readDOM(content=None, filename=None):
     from bs4 import BeautifulSoup
-    #f = open(fn)
-    #content = ''.join(f.readlines())
-    content = fn
+    if filename:
+        f = open(filename)
+        content = ''.join(f.readlines())
+
     dom = BeautifulSoup(content)
     couples = []
     for x in dom.findAll():
@@ -90,17 +100,19 @@ class domclassTest(HTTPTest):
     options = domclassArgs
     blocking = False
 
+    tool = True
+
     def runTool(self):
         import yaml, numpy
-        site_a = readDOM(self.local_options['file'])
-        site_b = readDOM(self.local_options['fileb'])
+        site_a = readDOM(filename=self.local_options['file'])
+        site_b = readDOM(filename=self.local_options['fileb'])
         a = compute_matrix(site_a)
-        self.result['eigenvalues'] = str(a['eigen'])
-        self.result['matrix'] = str(a['matrix'])
-        self.result['content'] = data[:200]
+        self.result['eigenvalues'] = a['eigen']
+        #self.result['matrix'] = str(a['matrix']
+        #self.result['content'] = data[:200]
         b = compute_matrix(site_b)
-        print "A: %s" % a
-        print "B: %s" % b
+        #print "A: %s" % a
+        #print "B: %s" % b
         correlation = numpy.vdot(a['eigen'],b['eigen'])
         correlation /= numpy.linalg.norm(a['eigen'])*numpy.linalg.norm(b['eigen'])
         correlation = (correlation + 1)/2
@@ -116,17 +128,17 @@ class domclassTest(HTTPTest):
             self.result['eigenvalues'] = None
             self.result['matrix'] = None
         else:
-            self.result['eigenvalues'] = str(a['eigen'])
-            self.result['matrix'] = str(a['matrix'])
+            self.result['eigenvalues'] = a['eigen']
+            #self.result['matrix'] = str(a['matrix'])
         #self.result['content'] = data[:200]
         #b = compute_matrix(site_b)
         print "A: %s" % a
-        return a
+        return a['eigen']
         #print "B: %s" % b
         #correlation = numpy.vdot(a['eigen'],b['eigen'])
         #correlation /= numpy.linalg.norm(a['eigen'])*numpy.linalg.norm(b['eigen'])
         #correlation = (correlation + 1)/2
-        # print "Corelation: %s" % correlation
+        #print "Corelation: %s" % correlation
 
 # We need to instantiate it otherwise getPlugins does not detect it
 # XXX Find a way to load plugins without instantiating them.



_______________________________________________
tor-commits mailing list
tor-commits@xxxxxxxxxxxxxxxxxxxx
https://lists.torproject.org/cgi-bin/mailman/listinfo/tor-commits