[Author Prev][Author Next][Thread Prev][Thread Next][Author Index][Thread Index]
[tor-commits] [ooni-probe/master] Make improvements to the classifier
commit 994860da43b2488de8a64d449ec2b01addb1ea4f
Author: Arturo Filastò <hellais@xxxxxxxxxxxxxx>
Date: Sun Aug 19 15:52:46 2012 -0700
Make improvements to the classifier
---
ooni/plugins/domclass.py | 54 ++++++++++++++++++++++++++++-----------------
1 files changed, 33 insertions(+), 21 deletions(-)
diff --git a/ooni/plugins/domclass.py b/ooni/plugins/domclass.py
index 5c9b6fb..31e2e41 100644
--- a/ooni/plugins/domclass.py
+++ b/ooni/plugins/domclass.py
@@ -17,6 +17,7 @@ class domclassArgs(usage.Options):
['fileb', 'b', None, 'Corpus file'],
['asset', 'a', None, 'URL List'],
['resume', 'r', 0, 'Resume at this index']]
+
alltags = ['A', 'ABBR', 'ACRONYM', 'ADDRESS', 'APPLET', 'AREA', 'B', 'BASE',
'BASEFONT', 'BD', 'BIG', 'BLOCKQUOTE', 'BODY', 'BR', 'BUTTON', 'CAPTION',
'CENTER', 'CITE', 'CODE', 'COL', 'COLGROUP', 'DD', 'DEL', 'DFN', 'DIR', 'DIV',
@@ -38,12 +39,10 @@ commontags = ['A', 'B', 'BLOCKQUOTE', 'BODY', 'BR', 'BUTTON', 'CAPTION',
'STRIKE', 'STRONG', 'STYLE', 'SUB', 'SUP', 'TABLE', 'TBODY', 'TD',
'TEXTAREA', 'TFOOT', 'TH', 'THEAD', 'TITLE', 'TR', 'TT', 'U', 'UL']
-thetags = ['A',
- 'DIV',
- 'FRAME', 'H1', 'H2',
- 'H3', 'H4', 'IFRAME ', 'INPUT', 'LABEL','LI', 'P', 'SCRIPT', 'SPAN',
- 'STYLE',
- 'TR']
+thetags = ['A', 'DIV', 'FRAME', 'H1', 'H2',
+ 'H3', 'H4', 'IFRAME ', 'INPUT',
+ 'LABEL','LI', 'P', 'SCRIPT', 'SPAN',
+ 'STYLE', 'TR']
def compute_matrix(dataset):
import itertools
@@ -65,15 +64,26 @@ def compute_matrix(dataset):
y = len(thetags)
matrix[x,y] += 1
+
+ for x in xrange(len(thetags) + 1):
+ possibilities = 0
+ for y in matrix[x]:
+ possibilities += y
+
+ for i in xrange(len(matrix[x])):
+ if possibilities != 0:
+ matrix[x][i] = matrix[x][i]/possibilities
+
ret['matrix'] = matrix
ret['eigen'] = numpy.linalg.eigvals(matrix)
return ret
-def readDOM(fn):
+def readDOM(content=None, filename=None):
from bs4 import BeautifulSoup
- #f = open(fn)
- #content = ''.join(f.readlines())
- content = fn
+ if filename:
+ f = open(filename)
+ content = ''.join(f.readlines())
+
dom = BeautifulSoup(content)
couples = []
for x in dom.findAll():
@@ -90,17 +100,19 @@ class domclassTest(HTTPTest):
options = domclassArgs
blocking = False
+ tool = True
+
def runTool(self):
import yaml, numpy
- site_a = readDOM(self.local_options['file'])
- site_b = readDOM(self.local_options['fileb'])
+ site_a = readDOM(filename=self.local_options['file'])
+ site_b = readDOM(filename=self.local_options['fileb'])
a = compute_matrix(site_a)
- self.result['eigenvalues'] = str(a['eigen'])
- self.result['matrix'] = str(a['matrix'])
- self.result['content'] = data[:200]
+ self.result['eigenvalues'] = a['eigen']
+ #self.result['matrix'] = str(a['matrix']
+ #self.result['content'] = data[:200]
b = compute_matrix(site_b)
- print "A: %s" % a
- print "B: %s" % b
+ #print "A: %s" % a
+ #print "B: %s" % b
correlation = numpy.vdot(a['eigen'],b['eigen'])
correlation /= numpy.linalg.norm(a['eigen'])*numpy.linalg.norm(b['eigen'])
correlation = (correlation + 1)/2
@@ -116,17 +128,17 @@ class domclassTest(HTTPTest):
self.result['eigenvalues'] = None
self.result['matrix'] = None
else:
- self.result['eigenvalues'] = str(a['eigen'])
- self.result['matrix'] = str(a['matrix'])
+ self.result['eigenvalues'] = a['eigen']
+ #self.result['matrix'] = str(a['matrix'])
#self.result['content'] = data[:200]
#b = compute_matrix(site_b)
print "A: %s" % a
- return a
+ return a['eigen']
#print "B: %s" % b
#correlation = numpy.vdot(a['eigen'],b['eigen'])
#correlation /= numpy.linalg.norm(a['eigen'])*numpy.linalg.norm(b['eigen'])
#correlation = (correlation + 1)/2
- # print "Corelation: %s" % correlation
+ #print "Corelation: %s" % correlation
# We need to instantiate it otherwise getPlugins does not detect it
# XXX Find a way to load plugins without instantiating them.
_______________________________________________
tor-commits mailing list
tor-commits@xxxxxxxxxxxxxxxxxxxx
https://lists.torproject.org/cgi-bin/mailman/listinfo/tor-commits